System locale setup
# Avoid garbled Chinese output
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] ""
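Note the empty string returned above: Sys.setlocale() returns "" when the requested locale cannot be set, so on this machine the call did not take effect. On Windows the zh_TW.UTF-8 name is usually not recognized; a hedged alternative (locale names vary by platform):
# On Windows, a locale alias such as "cht" typically selects Traditional Chinese
if (.Platform$OS.type == "windows") {
  Sys.setlocale(category = "LC_ALL", locale = "cht")
}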
Load packages
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(readr)
Download the Journey to the West text from Project Gutenberg (ID 23962)
# Drop rows whose text field is empty and remove duplicated lines
journey <- gutenberg_download(23962) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)
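If the default Project Gutenberg mirror is slow or unreachable, gutenberg_download() also accepts a mirror argument; the mirror URL below is only an example, not the one used in this analysis:
# Alternative download via an explicit mirror (kept commented out)
# journey <- gutenberg_download(23962, mirror = "http://mirrors.xmission.com/gutenberg/")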
Data preprocessing
# Inspecting the text shows that chapter headings start with "第X回"
# Use a regular expression to assign each line to its chapter
journey <- journey %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回|^ 第.*回"))))
# Chapter tagging done; inspect the result
str(journey)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23328 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ text : chr "第一回 靈根育孕源流出 心性修持大道生" " 詩曰:" " 混沌未分天地亂,茫茫渺渺無人見。" " 自從盤古破鴻濛,開闢從茲清濁辨。" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
Check the number of chapters
max(journey$chapter)
## [1] 100
Initialize the jieba tokenizer
# Use a user dictionary of Journey to the West proper nouns
jieba_tokenizer = worker(user = "Journey.traditional.dict")
# Define the tokenizer function
journey_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
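A quick sanity check of the tokenizer on a single arbitrary sentence (output not shown; the exact segmentation depends on the user dictionary):
# Should return a list containing one character vector of tokens
journey_tokenizer("行者聞言,急睜睛看")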
Tokenize the text
tokens <- journey %>%
  unnest_tokens(word, text, token = journey_tokenizer)
# Tokenization done; inspect the result
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 376112 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "靈根育孕" "源流" "出" ...
Count the occurrences of each token
# Single-character tokens are excluded
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
# The 20 most frequent tokens
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 行者 3932
## 2 八戒 1615
## 3 師父 1543
## 4 三藏 1282
## 5 一個 1054
## 6 唐僧 963
## 7 大聖 877
## 8 怎麼 738
## 9 我們 700
## 10 沙僧 700
## 11 菩薩 691
## 12 那裏 675
## 13 不知 642
## 14 和尚 629
## 15 妖精 612
## 16 兩個 582
## 17 笑道 552
## 18 甚麼 535
## 19 長老 502
## 20 不是 484
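Several of the top tokens (一個, 怎麼, 我們, 那裏, 不知, 兩個, 甚麼, 不是) are generic function words rather than content words. A small ad-hoc stop-word vector could screen them out before visualization; the list below is an assumption, not part of the original analysis:
# Ad-hoc Chinese stop-word list; extend as needed
stop_words_zh <- c("一個", "怎麼", "我們", "那裏", "不知", "兩個", "甚麼", "不是")
tokens_count_clean <- tokens_count %>%
  filter(!word %in% stop_words_zh)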
Draw a word cloud
tokens_count %>% wordcloud2()
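wordcloud2() also accepts sizing and font arguments, which can help CJK glyphs render cleanly; the font name below is an assumption about the viewing machine:
# Optional styling; fontFamily must be installed locally
tokens_count %>% wordcloud2(size = 1, fontFamily = "Microsoft JhengHei")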
Measure chapter length in sentences and in words
plot <-
  bind_rows(
    journey %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line(size = 1) +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量")
plot

Download the Journey to the West lexicon from Sogou
# Source: https://shouji.sogou.com/dict.php?cid=34&page=4
# Install the packages needed to process the lexicon
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load the libraries
library(readr)
library(devtools)
# For decoding .scel files
#install_github("qinwf/cidian")
library(cidian)
# Simplified-to-traditional Chinese conversion
#install_github("qinwf/ropencc")
library(ropencc)
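The decoding step itself is missing from this writeup; only its output survives below. A minimal sketch of what it likely looked like, assuming the Sogou file was saved as xiyouji.scel (both file names here are assumptions):
# Decode the Sogou .scel lexicon into a plain-text jiebaR user dictionary
decode_scel(scel = "xiyouji.scel", output = "Journey.simplified.dict")
# Apply the OpenCC simplified-to-traditional mapping
# (wrap run_convert() in vapply() if it does not vectorize on your version)
trans <- converter(S2T)
dict <- read_lines("Journey.simplified.dict")
write_lines(run_convert(trans, dict), "Journey.traditional.dict")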
A sample of the first fifty converted dictionary entries (each term is followed by the part-of-speech tag n):
## [1] "愛惜飛蛾紗照燈 n" "愛惜飛蛾紗照燈回 n" "艾葉花皮豹子精 n"
## [4] "安樂值錢多 n" "敖廣 n" "敖欽 n"
## [7] "敖閏 n" "敖順 n" "灞波兒奔 n"
## [10] "芭蕉扇 n" "八戒 n" "巴山虎 n"
## [13] "白骨精 n" "白龍馬 n" "白鹿怪 n"
## [16] "白麵狐狸 n" "白無常 n" "百眼魔君 n"
## [19] "白衣秀士 n" "白澤獅 n" "搬運車遲 n"
## [22] "寶象國捎書 n" "背凡人重如丘山 n" "北海龍王 n"
## [25] "被魔化身 n" "奔波兒灞 n" "避不得醒 n"
## [28] "避風如避箭 n" "弼馬溫 n" "避色如避仇 n"
## [31] "必是是非人 n" "辨認真邪 n" "病不討醫 n"
## [34] "不分男女 n" "不教而善 n" "不看僧面看佛面 n"
## [37] "不冷不熱 n" "不如本分為人 n" "不受苦中苦 n"
## [40] "不死帶傷 n" "不信直中直 n" "不醉即飽 n"
## [43] "財者末也 n" "蒼蠅包網兒 n" "草木不生 n"
## [46] "草木一秋 n" "曾著賣糖君子哄 n" "差之毫釐 n"
## [49] "嫦娥 n" "長他人之志氣 n"
Count the chapters and frequencies in which the main characters appear
# Define alias words for each character and store them in variables
# Sun Wukong
wukong_alias = c("老孫", "孫悟空","悟空","孫大聖","石猴","美猴王","弼馬溫","齊天大聖","行者","妖猴","心猿","大師兄")
# Tang Seng (Tripitaka)
tangseng_alias = c("師父", "唐僧", "三藏", "唐三藏", "唐長老", "聖僧", "玄奘", "高僧")
# Sha Wujing
wujing_alias = c("沙僧", "悟淨", "和尚")
# Zhu Bajie
bajie_alias = c("八戒", "悟能", "豬剛", "老豬")
Sun Wukong
# Count Wukong's alias words and their share of all tokens
wukong_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% wukong_alias) %>%     # keep only Wukong's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(wukong_count, 12)
## # A tibble: 12 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 行者 3932 376112 0.0105
## 2 悟空 381 376112 0.00101
## 3 老孫 320 376112 0.000851
## 4 孫大聖 191 376112 0.000508
## 5 孫悟空 115 376112 0.000306
## 6 齊天大聖 94 376112 0.000250
## 7 弼馬溫 47 376112 0.000125
## 8 美猴王 41 376112 0.000109
## 9 妖猴 32 376112 0.0000851
## 10 心猿 13 376112 0.0000346
## 11 大師兄 11 376112 0.0000292
## 12 石猴 2 376112 0.00000532
# Count Wukong's occurrences in each chapter
wukong_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% wukong_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的悟空出現總數") +
  xlab("章節") +
  ylab("悟空數量")
wukong_plot
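The same counting pipeline is repeated below for Tang Seng, Sha Wujing, and Zhu Bajie; wrapping it in a helper would remove the duplication. A sketch (count_alias is a hypothetical helper, not part of the original code):
# Frequency and corpus-wide proportion for a set of alias words
count_alias <- function(tok, alias) {
  tok %>%
    filter(nchar(word) > 1, word %in% alias) %>%
    group_by(word) %>%
    summarise(count = n()) %>%
    mutate(proportion = count / nrow(tok)) %>%
    arrange(desc(count))
}
# e.g. count_alias(tokens, tangseng_alias)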

Tang Seng
# Count Tang Seng's alias words and their share of all tokens
tangseng_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% tangseng_alias) %>%   # keep only Tang Seng's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(tangseng_count, 8)
## # A tibble: 8 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 師父 1543 376112 0.00410
## 2 三藏 1282 376112 0.00341
## 3 唐僧 963 376112 0.00256
## 4 聖僧 141 376112 0.000375
## 5 玄奘 65 376112 0.000173
## 6 唐三藏 54 376112 0.000144
## 7 唐長老 18 376112 0.0000479
## 8 高僧 16 376112 0.0000425
# Count Tang Seng's occurrences in each chapter
tangseng_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% tangseng_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的唐僧出現總數") +
  xlab("章節") +
  ylab("唐僧數量")
tangseng_plot

Sha Wujing
# Count Sha Wujing's alias words and their share of all tokens
wujing_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% wujing_alias) %>%     # keep only Sha Wujing's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(wujing_count)
## # A tibble: 3 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 沙僧 700 376112 0.00186
## 2 和尚 629 376112 0.00167
## 3 悟淨 52 376112 0.000138
# Count Sha Wujing's occurrences in each chapter
wujing_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% wujing_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的沙悟淨出現總數") +
  xlab("章節") +
  ylab("沙悟淨數量")
wujing_plot

Zhu Bajie
# Count Zhu Bajie's alias words and their share of all tokens
bajie_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% bajie_alias) %>%      # keep only Zhu Bajie's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(bajie_count)
## # A tibble: 4 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 八戒 1615 376112 0.00429
## 2 老豬 84 376112 0.000223
## 3 豬剛 5 376112 0.0000133
## 4 悟能 3 376112 0.00000798
# Count Zhu Bajie's occurrences in each chapter
bajie_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% bajie_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的豬八戒出現總數") +
  xlab("章節") +
  ylab("豬八戒數量")
bajie_plot

Comparing the main characters across chapters
wukong <- tokens %>%
  filter(word %in% wukong_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "悟空")
tangseng <- tokens %>%
  filter(word %in% tangseng_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "唐僧")
wujing <- tokens %>%
  filter(word %in% wujing_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "沙悟淨")
bajie <- tokens %>%
  filter(word %in% bajie_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "豬八戒")
bind_rows(wukong, tangseng, wujing, bajie) %>%
  ggplot(aes(x = chapter, y = count, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word, ncol = 1) +
  ggtitle("各主角比較") +
  xlab("章節") +
  ylab("出現次數")

Prepare the LIWC dictionary
# Positive-word dictionary (txt)
P <- read_file("positive.txt")
# Negative-word dictionary (txt)
N <- read_file("negative.txt")
# Split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
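A quick look at the assembled dictionary to confirm both classes loaded (output not shown):
# Inspect the combined dictionary and the word count per sentiment class
head(LIWC)
table(LIWC$sentiment)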
Label corpus words as positive or negative using the LIWC dictionary
tokens <- tokens %>%   # note: this overwrites the token-level tokens object
  select(chapter, word) %>%
  filter(nchar(word) > 1) %>%
  group_by(chapter, word) %>%
  summarise(sum = n()) %>%   # per-chapter count for each word
  arrange(desc(sum))
# Join with the LIWC sentiment dictionary
tokens_count %>% inner_join(LIWC)
## # A tibble: 78 x 3
## word sum sentiment
## <chr> <int> <fct>
## 1 寶貝 253 positive
## 2 不要 203 negative
## 3 不好 143 negative
## 4 仔細 102 positive
## 5 放心 91 positive
## 6 可憐 78 negative
## 7 保護 68 positive
## 8 無禮 60 negative
## 9 煩惱 57 negative
## 10 答應 56 positive
## # ... with 68 more rows
tokens %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 2,923 x 3
## # Groups: chapter [100]
## chapter word sentiment
## <int> <chr> <fct>
## 1 34 寶貝 positive
## 2 38 寶貝 positive
## 3 33 寶貝 positive
## 4 35 寶貝 positive
## 5 63 寶貝 positive
## 6 52 寶貝 positive
## 7 37 寶貝 positive
## 8 16 寶貝 positive
## 9 60 寶貝 positive
## 10 70 寶貝 positive
## # ... with 2,913 more rows
Count positive and negative words per chapter
sentiment_count <- tokens %>%
  select(chapter, word, sum) %>%
  inner_join(LIWC) %>%
  group_by(chapter, sentiment) %>%
  summarise(count = sum(sum))
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = chapter, y = count, colour = sentiment), size = 1) +
  # chapters where negative words clearly outnumber positive ones
  geom_vline(xintercept = c(31, 56), colour = "black") +
  # chapters where positive words clearly outnumber negative ones
  geom_vline(xintercept = c(34, 63), colour = "black")
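A possible follow-up (an assumption, not part of the original analysis): plotting the net sentiment per chapter, positive minus negative counts, makes the crossover chapters easier to spot. tidyr is already loaded; the values_fill argument here assumes a recent tidyr (>= 1.1):
# Net sentiment per chapter: positive minus negative word counts
sentiment_count %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = chapter, y = net)) +
  geom_col() +
  xlab("章節") + ylab("positive - negative")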
