安裝需要的packages
# echo = T,results = 'hide'
# Packages this analysis depends on. jiebaR and openxlsx are loaded further
# down the script, so they are part of the install check as well.
packages <- c(
  "dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr",
  "data.table", "reshape2", "wordcloud", "tidyr", "scales",
  "jiebaR", "openxlsx"
)
# Names of the packages that are already installed.
existing <- rownames(installed.packages())
# Install only what is missing, in a single call.
missing_packages <- setdiff(packages, existing)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
讀進library
require(dplyr)
require(tidytext)
require(jiebaR)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(data.table)
library(stringr)
library(reshape2)
library(wordcloud)
library(readr)
library(openxlsx)
# Read the raw text file as UTF-8.
data <- fread("../data/booksx_2.txt", encoding = "UTF-8", fill = TRUE)
# Build the verse table:
#   * bookcode is a running count of the rows whose `book` field matches
#     "^=[0-1][0-9]{2}", so every row carries the index of the most recent
#     such marker row;
#   * the raw `book` and `chapter` columns are then dropped;
#   * rows whose ch_book is "-" (a special formatting row) are removed.
# Columns are referenced by name inside the verbs instead of through `data$`,
# so each step operates on the rows actually flowing through the pipeline
# rather than relying on it still having the same length as the raw table.
bible <- data %>%
  mutate(bookcode = cumsum(str_detect(book, regex("^=[0-1][0-9]{2}")))) %>%
  select(-book, -chapter) %>%
  filter(!(ch_book %in% "-"))
str(bible)
## Classes 'data.table' and 'data.frame': 31172 obs. of 4 variables:
## $ ch_book : chr "創世紀" "創世紀" "創世紀" "創世紀" ...
## $ ch_chapter: chr "1:1" "1:2" "1:3" "1:4" ...
## $ text : chr "起初 神創造天地。" "地是空虛混沌.淵面黑暗. 神的靈運行在水面上。" " 神說、要有光、就有了光。" " 神看光是好的、就把光暗分開了。" ...
## $ bookcode : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Split into Old and New Testament: book codes below 40 are tagged "old",
# every other code "new".
bible <- bible %>%
  mutate(novel = ifelse(bookcode < 40, "old", "new"))
(1). 文章斷詞
設定斷詞引擎
# Create the jiebaR segmentation engine with a user-defined dictionary and a
# stop-word list.
jieba_tokenizer <- worker(
  stop_word = "bible_stop_words.txt",
  user = "bible_lexicon.tradictional_2.txt"
)
# Tokenizer function: segments every element of `t` with the shared
# jieba_tokenizer engine.
#
# t: vector (or list) of text strings to segment.
# Returns a list with one element per entry of `t`, each holding the result
# of segment() for that entry.
customized_tokenizer <- function(t) {
  lapply(t, function(sentence) segment(sentence, jieba_tokenizer))
}
# Tokenize: split the `text` column into a new `word` column using the custom
# tokenizer function.
bible_tokens <- unnest_tokens(
  bible,
  output = word,
  input = text,
  token = customized_tokenizer
)
(2). 資料基本清理
# Filter out special characters from the token table.
bible_select <- bible_tokens %>%
  # Drop tokens that contain punctuation.
  filter(!grepl("[[:punct:]]", word)) %>%
  # Drop tokens that contain digits or lower-case Latin letters. The previous
  # class "['^0-9a-z']" also listed the quote and caret as literal members;
  # both are punctuation and are already removed by the filter above, so the
  # class is reduced to what it is meant to express.
  filter(!grepl("[0-9a-z]", word)) %>%
  # Keep multi-character tokens, plus a short list of single-character words
  # that are kept on purpose.
  filter(nchar(word) > 1 | word %in% c("神", "主", "信", "望", "愛"))
# Word frequencies per book.
word_count <- bible_select %>%
  group_by(ch_book, word) %>%
  # Count the occurrences of each word within each book. .groups = "drop_last"
  # spells out summarise()'s default (the result stays grouped by ch_book), so
  # the grouping is deliberate and no informational message is emitted.
  summarise(count = n(), .groups = "drop_last") %>%
  # Drop words that occur ten times or fewer within a book.
  filter(count > 10) %>%
  arrange(desc(count))
word_count
## # A tibble: 3,044 x 3
## # Groups: ch_book [61]
## ch_book word count
## <chr> <chr> <int>
## 1 詩篇 耶和華 754
## 2 耶利米書 耶和華 737
## 3 申命記 耶和華 536
## 4 歷代志上 兒子 505
## 5 以賽亞書 耶和華 484
## 6 以西結書 耶和華 438
## 7 詩篇 神 436
## 8 出埃及記 耶和華 413
## 9 歷代志下 耶和華 398
## 10 民數記 耶和華 394
## # … with 3,034 more rows
# Draw a word cloud of the words that occur more than `min_count` times in one
# testament.
#
# tokens:    token table with a `novel` column ("old" / "new") and a `word`
#            column.
# testament: value of `novel` to keep.
# min_count: a word must occur more than this many times to be included.
# Returns the value of wordcloud2() for the filtered word/frequency table.
testament_wordcloud <- function(tokens, testament, min_count = 100) {
  tokens %>%
    filter(novel == testament) %>%
    group_by(word) %>%
    summarise(sum = n()) %>%
    filter(sum > min_count) %>%
    wordcloud2()
}
# Old Testament: 耶和華, 以色列, 神 and 兒子 appear most often.
testament_wordcloud(bible_select, "old")
# New Testament: 耶穌, 神, 基督 and 主 appear most often.
testament_wordcloud(bible_select, "new")
分為正向情緒與負向情緒
讀檔,字詞間以“,”將字分隔
# Positive-word dictionary (txt file).
P <- read_file(file = "../data/dict/liwc/positive.txt")
# Negative-word dictionary (txt file).
N <- read_file(file = "../data/dict/liwc/negative.txt")
NRC <- read.xlsx(xlsxFile = "../data/dict/liwc/NRC.xlsx")
# A txt dictionary is read in as one single string.
typeof(P)
## [1] "character"
# Inspect the dictionary.
head(NRC, n = 6L)
## word sentiment
## 1 放棄 negative
## 2 棄 negative
## 3 放棄 negative
## 4 ABBA positive
## 5 綁架 negative
## 6 異常 negative
在畫出情緒之前,先看看每卷書的經節數量,以第19卷的詩篇經節數最多。
# Number of rows per book code, drawn as a line over the book code.
bible %>%
  group_by(bookcode) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = bookcode, y = count)) +
  geom_line()
# Show the ch_book value of the rows carrying book code 19.
bible[bible$bookcode == 19, "ch_book"]
## ch_book
## 1: 詩篇
## 2: 詩篇
## 3: 詩篇
## 4: 詩篇
## 5: 詩篇
## ---
## 2458: 詩篇
## 2459: 詩篇
## 2460: 詩篇
## 2461: 詩篇
## 2462:
找出文集中,對於NRC字典是positive和negative的字
算出每卷書的情緒總和(sentiment_count)
# Count the NRC sentiment words per book.
sentiment_count <- bible_select %>%
  # The NRC table contains repeated (word, sentiment) rows -- head(NRC) lists
  # the same word/sentiment pair more than once -- and every repeat would
  # duplicate the matching tokens in the join. Reduce the dictionary to unique
  # pairs first so each token is counted once per sentiment, and name the
  # join key explicitly instead of relying on the shared column names.
  # NOTE(review): counts printed by an earlier render of this notebook were
  # produced without the de-duplication and may be higher.
  inner_join(distinct(NRC, word, sentiment), by = "word") %>%
  group_by(ch_book, bookcode, novel, sentiment) %>%
  # Spell out summarise()'s default regrouping (drops only `sentiment`).
  summarise(count = n(), .groups = "drop_last")
可以看出舊約負面情緒較高,新約正面情緒較高
# Range of book codes present in sentiment_count (1 to 66).
range(sentiment_count$bookcode)
## [1] 1 66
# Sentiment-word counts per book, one line per sentiment.
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = bookcode, y = count, colour = sentiment)) +
  # Mark the boundary between the Old and New Testament. Book code 40 is the
  # first code labelled "new", so the line is placed there directly instead of
  # looking the value 40 up inside aes(), which would yield NA (and no line)
  # if book 40 had no rows in sentiment_count.
  geom_vline(xintercept = 40, colour = "red")
將情緒標準化再畫一次圖,我們能發現在新約聖經中,正面情緒是大於負面情緒的,舊約的部分則是正負面情緒各半。
sentiment_count %>%
  # Normalisation: share of each sentiment within its book.
  group_by(bookcode) %>%
  mutate(ratio = count / sum(count)) %>%
  # The grouping is only needed for the ratio; drop it before plotting.
  ungroup() %>%
  # Plotting.
  ggplot() +
  geom_line(aes(x = bookcode, y = ratio, colour = sentiment)) +
  # Mark the boundary between the Old and New Testament (book code 40 is the
  # first code labelled "new"). A constant is used instead of looking the
  # value 40 up inside aes(), which would yield NA if book 40 had no rows.
  geom_vline(xintercept = 40, colour = "red")
# Book 19 has both the highest positive and the highest negative total.
sentiment_count %>%
  group_by(bookcode, sentiment) %>%
  # .groups = "drop_last" spells out summarise()'s default (the result stays
  # grouped by bookcode) so no informational message is emitted.
  summarise(sum = sum(count), .groups = "drop_last") %>%
  arrange(desc(sum))
## # A tibble: 132 x 3
## # Groups: bookcode [66]
## bookcode sentiment sum
## <int> <chr> <int>
## 1 19 positive 2972
## 2 23 positive 1659
## 3 19 negative 1441
## 4 26 positive 1425
## 5 24 negative 1319
## 6 5 positive 1304
## 7 44 positive 1199
## 8 23 negative 1196
## 9 1 positive 1178
## 10 42 positive 1133
## # … with 122 more rows
可以看出在舊約情緒用詞不管是正負面都較新約高,兩者的正面情緒皆與負面情緒相差兩倍之多
# Total sentiment-word count per testament, positive and negative side by side.
sentiment_count %>%
  # sentiment_count holds one row per book and sentiment. Without aggregating,
  # the dodged bars of all books in a testament are drawn on top of each
  # other, so the visible height is the largest single book rather than the
  # testament total. Sum per testament and sentiment first.
  group_by(novel, sentiment) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  ggplot(aes(x = novel, y = count, fill = sentiment)) +
  geom_col(position = "dodge") +
  # Font that can render the Chinese labels.
  # NOTE(review): "Heiti TC Light" is presumably a macOS font -- confirm it is
  # available on the rendering machine.
  theme(text = element_text(family = "Heiti TC Light"))