System parameter settings
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""
Install the required packages
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr", "data.table", "reshape2", "wordcloud", "tidyr", "scales", "jiebaR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)

df <- fread("data/bibble.txt", encoding = "UTF-8", fill = TRUE)
bible <- df %>%
  mutate(bookcode = cumsum(str_detect(df$book, regex("^=[0-1][0-9]{2}")))) %>%
  select(-book, -chapter)
# Format of the "=" header lines: the 1st digit is 0 or 1 (0 = Old Testament, 1 = New Testament); digits 2-3 are a running serial number
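As a quick illustration of how the running book code falls out of that header format (a minimal sketch on made-up lines, not the actual contents of bibble.txt):

# made-up miniature of the input: "=" header lines followed by verse lines
demo <- c("=001 Genesis", "In the beginning ...", "=002 Exodus", "Now these are the names ...")
cumsum(str_detect(demo, regex("^=[0-1][0-9]{2}")))
# returns 1 1 2 2: every verse line inherits the code of the most recent "=" header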
Set up the tokenizer engine with a custom dictionary of Bible proper nouns (the names of the prophets)
# add the custom dictionary and stop-word list
bible_jieba_tokenizer <- worker(user="data/bible_lexicon.tradictional_2.txt", stop_word = "data/bible_stop_words.txt")
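The two dictionary files themselves are not shown here. For illustration only, jiebaR's user dictionary and stop-word files are plain UTF-8 text with one entry per line, so a throwaway pair could be built like this (hypothetical file names and entries, not the files used above):

# hypothetical miniature dictionaries, one term per line
writeLines(c("以賽亞", "耶利米", "以西結"), "demo_lexicon.txt")
writeLines(c("的", "了"), "demo_stop_words.txt")
demo_worker <- worker(user = "demo_lexicon.txt", stop_word = "demo_stop_words.txt")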
# define the tokenization function
bible_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, bible_jieba_tokenizer)
return(tokens)
})
}

bible_tokens <- bible %>% unnest_tokens(word, text, token = bible_tokenizer)
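unnest_tokens() hands the text column to the token function and expects back a list with one character vector per row, which is why bible_tokenizer wraps segment() in lapply(). A quick sanity check on a single made-up verse (the exact output depends on the dictionaries actually loaded):

# the tokenizer should return a list holding one character vector of tokens
bible_tokenizer("起初神創造天地")
# e.g. list(c("起初", "神", "創造", "天地")); exact segmentation depends on the user dictionary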
# split into Old and New Testament
bible_select <- bible_tokens %>%
  # keep words longer than one character, plus the single-character words 神 (God) and 主 (Lord)
  filter(nchar(.$word) > 1 | .$word == "神" | .$word == "主")
bible_tokens_count_old = bible_tokens %>%
filter(nchar(.$word)>1 | .$word =="神" | .$word =="主" ) %>%
filter(.$bookcode<40)%>%
count(word, sort = TRUE)
bible_tokens_count_new = bible_tokens %>%
filter(nchar(.$word)>1 | .$word =="神"| .$word =="主" )%>%
filter(.$bookcode>=40)%>%
count(word, sort = TRUE)

LIWC, short for Linguistic Inquiry and Word Count, is a sentiment dictionary published by the psychologist Pennebaker in 2001; it divides words into positive and negative emotion categories.
P <- read_file("data/positive.txt") # positive-word dictionary (txt)
N <- read_file("data/negative.txt") # negative-word dictionary (txt)
# the dictionary txt file is read in as one long string, so split it on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words
# combine the two dictionaries
LIWC = rbind(P, N)
# inspect the combined dictionary
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
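To confirm the dictionary sizes noted in the comments above (664 positive and 1047 negative entries), a one-line check on the combined table:

table(LIWC$sentiment)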
★ The analysis below mainly works at the book level (bookcode 1-66).
Find the words in the corpus that the LIWC dictionary marks as positive or negative.
sentiment_count = bible_select %>%
select(bookcode,word) %>%
inner_join(LIWC) %>%
group_by(bookcode,sentiment) %>%
summarise(count = n())

Plot the total sentiment word count for each book. At around book 40 (the start of the New Testament) the sentiment shifts from mostly negative to mostly positive.
# check the range of book codes in the data
range(sentiment_count$bookcode) # 1~66
## [1] 1 66
sentiment_count %>%
ggplot()+
geom_line(aes(x=bookcode,y=count,colour=sentiment))+
# add a vertical line marking book 40, the start of the New Testament
# (equivalent to geom_vline(xintercept = 40, colour = "red"))
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)[1]])), colour = "red")

After normalising the sentiment counts within each book and plotting again, we can see that in the New Testament positive sentiment outweighs negative sentiment, while in the Old Testament positive and negative are roughly even.
sentiment_count %>%
# normalise: convert counts to within-book ratios
group_by(bookcode) %>%
mutate(ratio = count/sum(count)) %>%
# plot
ggplot()+
geom_line(aes(x=bookcode,y=ratio,colour=sentiment))+
# add a vertical line marking book 40, the start of the New Testament
geom_vline(aes(xintercept = as.numeric(bookcode[which(sentiment_count$bookcode == 40)[1]])), colour = "red")

# rank the books by total number of sentiment words
sentiment_count %>%
select(count,bookcode) %>%
group_by(bookcode) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
## # A tibble: 66 x 2
## bookcode sum
## <int> <int>
## 1 19 1855
## 2 23 1074
## 3 24 983
## 4 20 962
## 5 26 698
## 6 18 588
## 7 42 437
## 8 3 390
## 9 5 389
## 10 14 377
## # ... with 56 more rows
# book 19 (Psalms) has the most positive-sentiment words; book 23 (Isaiah) has the most negative-sentiment words
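The ranking above sums positive and negative words together, so it does not by itself show which sentiment tops each book. A quick way to check the per-sentiment maxima, sketched against the sentiment_count table built above (assumes dplyr 1.0+ for slice_max):

# top-scoring book for each sentiment, counted separately
sentiment_count %>%
  group_by(sentiment) %>%
  slice_max(order_by = count, n = 1)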