R Markdown

安裝需要的packages

# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

讀進library

require(dplyr)
require(tidytext)
require(jiebaR)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(data.table)
library(stringr)
library(reshape2)
library(wordcloud)
library(readr)
library(openxlsx)

把檔案讀進來

data <- fread("../data/booksx_2.txt", encoding = "UTF-8",fill=TRUE)

bible <- data %>% 
  mutate(bookcode = cumsum(str_detect(data$book,regex("^=[0-1][0-9]{2}")))) %>%   
  select (-book,-chapter)  %>%
  filter(!(data$ch_book %in% c("-")))  #去除特殊的格式 
str(bible)

## Classes 'data.table' and 'data.frame':   31172 obs. of  4 variables:
##  $ ch_book   : chr  "創世紀" "創世紀" "創世紀" "創世紀" ...
##  $ ch_chapter: chr  "1:1" "1:2" "1:3" "1:4" ...
##  $ text      : chr  "起初　神創造天地。" "地是空虛混沌．淵面黑暗．　神的靈運行在水面上。" "　神說、要有光、就有了光。" "　神看光是好的、就把光暗分開了。" ...
##  $ bookcode  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# 分成新舊約
bible$novel <- ifelse(bible$bookcode < 40,"old","new")

1. 資料前處理

(1). 文章斷詞

設定斷詞引擎

# 加入自定義的字典
jieba_tokenizer <- worker(user="bible_lexicon.tradictional_2.txt", stop_word = "bible_stop_words.txt")
# 設定斷詞function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
# 斷詞
bible_tokens <- bible %>% unnest_tokens(word, text, token=customized_tokenizer)

(2). 資料基本清理

去除特殊字元、詞頻太低的字

# 過濾特殊字元
bible_select = bible_tokens %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1 | .$word =="神" | .$word =="主" |.$word =="信"|.$word =="望"|.$word =="愛")

word_count <- bible_select %>%
  group_by(ch_book,word) %>%
  summarise(count=n()) %>%  # 算字詞單篇總數用summarise
  filter(count>10) %>%  # 過濾出現太少次的字
  arrange(desc(count))

## `summarise()` has grouped output by 'ch_book'. You can override using the `.groups` argument.

word_count

## # A tibble: 3,044 x 3
## # Groups:   ch_book [61]
##    ch_book  word   count
##    <chr>    <chr>  <int>
##  1 詩篇     耶和華   754
##  2 耶利米書 耶和華   737
##  3 申命記   耶和華   536
##  4 歷代志上 兒子     505
##  5 以賽亞書 耶和華   484
##  6 以西結書 耶和華   438
##  7 詩篇     神       436
##  8 出埃及記 耶和華   413
##  9 歷代志下 耶和華   398
## 10 民數記   耶和華   394
## # … with 3,034 more rows

文字雲

# 舊約 耶和華、以色列、神、兒子出現最多

bible_select %>% 
  filter(novel == "old") %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>100)%>%wordcloud2()

# 新約 ˇ以耶穌、神、基督、主最多

bible_select %>% 
  filter(novel == "new") %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>100)%>%wordcloud2()

2. 準備NRC字典

分為正向情緒與負向情緒

讀檔，字詞間以“,”將字分隔

P <- read_file("../data/dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("../data/dict/liwc/negative.txt") # 負向字典txt檔 
NRC <- read.xlsx("../data/dict/liwc/NRC.xlsx")

#字典txt檔讀進來是一整個字串
typeof(P)

## [1] "character"

# 檢視字典
head(NRC)

##   word sentiment
## 1 放棄  negative
## 2   棄  negative
## 3 放棄  negative
## 4 ABBA  positive
## 5 綁架  negative
## 6 異常  negative

3. 將聖經和NRC情緒字典做join

在畫出情緒之前，先看看每章的數量情形，以19回的詩篇句數最多。

正負情緒發文折線圖

bible %>%
  group_by(bookcode) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(aes(x=bookcode,y=count))

bible[bible$bookcode == 19,"ch_book"]

##       ch_book
##    1:    詩篇
##    2:    詩篇
##    3:    詩篇
##    4:    詩篇
##    5:    詩篇
##   ---        
## 2458:    詩篇
## 2459:    詩篇
## 2460:    詩篇
## 2461:    詩篇
## 2462:

找出文集中，對於NRC字典是positive和negative的字

算出每回情緒總和(sentiment_count)

sentiment_count = bible_select %>%
  inner_join(NRC) %>% 
  group_by(ch_book,bookcode,novel,sentiment) %>%
  summarise(count=n())

## Joining, by = "word"

## `summarise()` has grouped output by 'ch_book', 'bookcode', 'novel'. You can override using the `.groups` argument.

檢視資料的章節區間

可以看出舊約負面情緒較高，新約正面情緒較高

range(sentiment_count$bookcode) #1~66

## [1]  1 66

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=bookcode,y=count,colour=sentiment))+
  # 加上標示新舊約分界的線
  geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)
[1]])),colour = "red")

正負情緒比例折線圖

將情緒標準化再畫一次圖，我們能發現在新約聖經中，正面情緒是大於負面情緒的，舊約的部分則是正負面情緒各半。

sentiment_count %>% 
  # 標準化的部分
  group_by(bookcode) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
  # 加上標示日期的線
  geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)
[1]])),colour = "red")

查看每章的情緒分數排名

#第19章為正負面情緒最高
sentiment_count %>%
  group_by(bookcode,sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))

## `summarise()` has grouped output by 'bookcode'. You can override using the `.groups` argument.

## # A tibble: 132 x 3
## # Groups:   bookcode [66]
##    bookcode sentiment   sum
##       <int> <chr>     <int>
##  1       19 positive   2972
##  2       23 positive   1659
##  3       19 negative   1441
##  4       26 positive   1425
##  5       24 negative   1319
##  6        5 positive   1304
##  7       44 positive   1199
##  8       23 negative   1196
##  9        1 positive   1178
## 10       42 positive   1133
## # … with 122 more rows

比較正負情緒在新舊上的差異

可以看出在舊約情緒用詞不管是正負面都較新約高，兩者的正面情緒皆與負面情緒相差兩倍之多

sentiment_count %>% 
  ggplot(aes(x = novel, y = count, fill = sentiment)) + 
  geom_bar(stat = "identity", position = "dodge")+
  theme(text = element_text(family = "Heiti TC Light"))

hw5