## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 541544 29.0 1236600 66.1 621337 33.2
## Vcells 1019235 7.8 8388608 64.0 1600933 12.3
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # Request the zh_TW.UTF-8 locale for all categories to avoid garbled Chinese text
## [1] ""
# 載入library
library(gutenbergr)
library(jiebaR)
library(tidyverse)
library(stringr)
library(wordcloud2)
library(tidytext)
library(reshape2)
# Lexicon source: https://shouji.sogou.com/dict.php?cid=34&page=4
# Load supporting libraries
library(readr)
library(devtools)
# cidian is used to decode .scel lexicon files. Install it from GitHub only
# when it is missing, so re-running the script does not reinstall it (and does
# not need network access) every time.
if (!requireNamespace("cidian", quietly = TRUE)) {
  install_github("qinwf/cidian")
}
library(cidian)
# ropencc is used to convert Simplified Chinese to Traditional Chinese; it is
# installed under the same guard.
if (!requireNamespace("ropencc", quietly = TRUE)) {
  install_github("qinwf/ropencc")
}
library(ropencc)
# Paths of the lexicon in its three stages: raw .scel, decoded text,
# and decoded text converted to Traditional Chinese.
scel_path <- "./Journey_to_the_West_lexicon.scel"
decoded_dict_path <- "./Journey_to_the_West_lexicon.scel_dict"
traditional_dict_path <- "./Journey_to_the_West_lexicon.traditional.dict"

# Decode the Sogou .scel lexicon into a text dictionary file
decode_scel(scel = scel_path, output = decoded_dict_path, cpp = TRUE)

# Peek at the first 50 lines of the decoded dictionary
scan(
  file = decoded_dict_path,
  what = character(),
  nlines = 50,
  sep = "\n",
  encoding = "utf-8",
  fileEncoding = "utf-8"
)
## [1] "?"

# Read the whole decoded dictionary, convert it from Simplified to
# Traditional Chinese (S2TW) and write the result to a new file
dict <- read_file(decoded_dict_path)
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, traditional_dict_path)

# Peek at the first 50 lines of the converted dictionary
scan(
  file = traditional_dict_path,
  what = character(),
  nlines = 50,
  sep = "\n",
  encoding = "utf-8",
  fileEncoding = "utf-8"
)
## [1] "愛惜飛蛾紗照燈 n" "愛惜飛蛾紗照燈回 n" "艾葉花皮豹子精 n"
## [4] "安樂值錢多 n" "敖廣 n" "敖欽 n"
## [7] "敖閏 n" "敖順 n" "灞波兒奔 n"
## [10] "芭蕉扇 n" "八戒 n" "巴山虎 n"
## [13] "白骨精 n" "白龍馬 n" "白鹿怪 n"
## [16] "白麵狐狸 n" "白無常 n" "百眼魔君 n"
## [19] "白衣秀士 n" "白澤獅 n" "搬運車遲 n"
## [22] "寶象國捎書 n" "背凡人重如丘山 n" "北海龍王 n"
## [25] "被魔化身 n" "奔波兒灞 n" "避不得醒 n"
## [28] "避風如避箭 n" "弼馬溫 n" "避色如避仇 n"
## [31] "必是是非人 n" "辨認真邪 n" "病不討醫 n"
## [34] "不分男女 n" "不教而善 n" "不看僧面看佛面 n"
## [37] "不冷不熱 n" "不如本分為人 n" "不受苦中苦 n"
## [40] "不死帶傷 n" "不信直中直 n" "不醉即飽 n"
## [43] "財者末也 n" "蒼蠅包網兒 n" "草木不生 n"
## [46] "草木一秋 n" "曾著賣糖君子哄 n" "差之毫釐 n"
## [49] "嫦娥 n" "長他人之志氣 n"
# Reference: https://kknews.cc/zh-tw/culture/9v3aej.html
# Hand-written list of extra terms that is registered with the tokenizer via
# new_user_word() further down, in addition to the converted lexicon file.
# NOTE(review): some entries are listed more than once (e.g. "紅孩兒",
# "托塔天王"); presumably harmless for a user dictionary, but worth confirming.
self_lexicon <-c("唐僧", "三藏","取經","徒弟","繡花針","紅孩兒","悟能","豬八戒","悟空","悟淨",
"觀世音菩薩","觀世音","白龍馬","小白龍","西海龍王","佛祖","菩提祖師",
"筋斗雲","美猴王","玉皇大帝","王母娘娘","瑤池娘娘","齊天大聖","蟠桃",
"太上老君","李名耳","托塔天王","十八羅漢","金丹","青牛精","金鋼琢",
"靈吉菩薩","太白金星","芭蕉扇","羅剎女","鐵扇公主","南極壽星","狐狸精",
"白鹿精","南極仙翁","弼馬溫","西天取經","獅駝洞","鎮元大仙","鎮元子",
"人參果","東來佛祖","彌勒佛","彌勒菩薩","釋迦牟尼","未來佛","二郎真君",
"楊名戩","托塔天王","李名靖","托塔李天王","如來佛祖","婆羅門教","文殊菩薩",
"五台山","青毛獅","蓮花台","普賢菩薩","峨眉山","四海龍王","海龍王",
"敖廣","南海龍王","敖欽","北海龍王","敖順","西海龍王","敖閏","陰曹閻王",
"金箍棒","閻羅","李世民","昴日星官","哪吒","敖摩昂","太子","鼉龍","摩昂",
"嫦娥","后羿","毗藍婆菩薩","紫雲山千花洞","昴日星官","巨靈神","木叉","木吒",
"熊羆怪","黃風大聖","黃風怪","三股鋼叉","黃風嶺","南山大王","黃眉大王",
"六耳獼猴","騰雲駕霧","通天河魚怪","紅孩兒","聖嬰大王","牛魔王","善財童子",
"獨角兕大王","百眼魔君","多眼怪","黃花觀","八戒","沙僧","沙悟淨","白骨精",
"白骨夫人","妖精","金角大王","銀角大王","七星寶劍","紅葫蘆","玉凈瓶",
"金繩子","獅魔王","青毛獅子","獅魔","白鹿精","蟠龍","蠍子精","黃袍怪",
"奎木狼","九頭怪","獅駝洞老妖","獅駝洞二怪","獅駝洞三怪","獅駝洞","紅蟒精",
"玄英洞","辟寒大王","辟暑大王","辟塵大王","盤絲洞","蜘蛛精","七仙姑",
"玉兔精","天竺國","老鼠精","半截觀音","地涌夫人","虎力大仙","鹿力大仙",
"羊力大仙","齊天大聖","西遊記","唐三藏","老孫","美猴王","孫行者","孫大聖","貧僧")
# Create the jieba segmentation worker with a user dictionary file and a
# stop-word file, then register the extra terms from `self_lexicon` with it.
jieba_tokenizer <- worker(
  user = "Journey_to_the_West_lexicon.traditional.dict",
  stop_word = "stop_words.txt"
)
new_user_word(jieba_tokenizer, self_lexicon)
## [1] TRUE
# Download Project Gutenberg book 23962, drop empty lines and repeated
# (gutenberg_id, text) rows, then number the chapters: `chapter` grows by one
# at every line that matches the chapter-heading pattern.
# The heading of chapter 83 is preceded by an extra space, which is why the
# pattern carries the second alternative "^ 第.*回".
West_Journey <- gutenberg_download(23962) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回|^ 第.*回"))))
# Custom tokenizer passed to unnest_tokens(): segments each element of `t`
# with the global `jieba_tokenizer` worker and returns a list holding one
# token vector per element.
WestJourney_tokenizer <- function(t) {
  lapply(t, function(x) segment(x, jieba_tokenizer))
}
# Word segmentation: one row per token, using the custom jieba tokenizer
tokens <- unnest_tokens(
  West_Journey,
  word,
  text,
  token = WestJourney_tokenizer
)

# Occurrence count of every token longer than one character,
# sorted from most to least frequent
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum))

summary(tokens_count)
## word sum
## Length:46203 Min. : 1.000
## Class :character 1st Qu.: 1.000
## Mode :character Median : 1.000
## Mean : 3.947
## 3rd Qu.: 2.000
## Max. :3931.000
# Word cloud built from the token frequency table
wordcloud2(tokens_count, size = 1)

# Horizontal bar chart of the tokens (longer than one character) that occur
# more than 500 times, ordered by frequency
tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  xlab(NULL) +
  ylab("出現次數")
# Line chart with two series per chapter: the number of rows of West_Journey
# ("sentences") and the number of rows of tokens ("words").
#
# Changes from the previous version:
# - removed `fill = "type"`: it mapped a constant string (not the `type`
#   column) to an aesthetic that geom_line() does not use;
# - removed `group_by(type)` before ggplot(): grouping a data frame has no
#   effect on the plot, the lines are already split by the colour mapping;
# - the title and y-axis label only mentioned sentences although the chart
#   also shows word counts, so both now name what is actually plotted.
plot <- bind_rows(
  West_Journey %>%
    group_by(chapter) %>%
    summarise(count = n(), type = "sentences"),
  tokens %>%
    group_by(chapter) %>%
    summarise(count = n(), type = "words")
) %>%
  ggplot(aes(x = chapter, y = count, color = factor(type))) +
  geom_line(size = 1) +
  ggtitle("各章節的句子與字詞總數") +
  xlab("章節") +
  ylab("數量")
plot
# Alias lists: the different names under which each character is counted.
# Each vector is matched against the tokens with %in% further down.
# Sun Wukong (孫悟空)
wukong <- c("老孫", "孫悟空","悟空","孫大聖","石猴","美猴王","弼馬溫","齊天大聖","行者","妖猴","心猿","大師兄","孫行者")
# Tang Sanzang (唐三藏)
tangseng <- c("師父", "唐僧", "三藏", "唐三藏", "唐長老", "聖僧", "玄奘", "高僧","貧僧")
# Sha Wujing (沙悟淨)
wujing <- c("沙僧", "悟淨", "沙悟淨")
# Zhu Bajie (豬八戒)
bajie <- c("八戒", "悟能", "豬剛", "老豬", "豬八戒")
# Demons / monsters (妖精); note that "妖精" itself is listed twice, which
# makes no difference for %in% matching
evil <- c("白骨精","白骨夫人","妖精","金角大王","銀角大王","獅魔王","青毛獅子","獅魔","白鹿精","蟠龍","蠍子精",
"黃袍怪","奎木狼","九頭怪","獅駝洞老妖","獅駝洞二怪","獅駝洞三怪","紅蟒精","辟寒大王","辟暑大王",
"辟塵大王","蜘蛛精","玉兔精","老鼠精","半截觀音","地涌夫人","虎力大仙","虎力","鹿力大仙","鹿力",
"羊力大仙","羊力","羅剎女","鐵扇公主","牛魔王","妖精","邪怪")
# Count how often each character is mentioned in every chapter.

# Keep the tokens (longer than one character) that belong to `aliases`,
# grouped by chapter and ready to be summarised.
mentions_by_chapter <- function(aliases) {
  tokens %>%
    filter(nchar(word) > 1, word %in% aliases) %>%
    group_by(chapter)
}

# One table per character: chapter + number of mentions in that chapter.
# The count column is named after the character.
wukong_num <- summarise(mentions_by_chapter(wukong), `孫悟空` = n())
tangseng_num <- summarise(mentions_by_chapter(tangseng), `唐三藏` = n())
wujing_num <- summarise(mentions_by_chapter(wujing), `沙悟淨` = n())
bajie_num <- summarise(mentions_by_chapter(bajie), `豬八戒` = n())
evil_num <- summarise(mentions_by_chapter(evil), `妖精們` = n())
# Merge the per-character tables onto the full list of chapters
character_data <- West_Journey %>%
  distinct(chapter) %>%
  left_join(wukong_num, by = "chapter") %>%
  left_join(tangseng_num, by = "chapter") %>%
  left_join(wujing_num, by = "chapter") %>%
  left_join(bajie_num, by = "chapter") %>%
  left_join(evil_num, by = "chapter")

# A chapter without any mention of a character has no row in that character's
# table, so the join leaves NA there; replace those with 0.
character_data[is.na(character_data)] <- 0

# Reshape to long form (chapter, variable, value), ordered by chapter
character_data <- character_data %>%
  melt(id.vars = "chapter") %>%
  arrange(chapter)
# Plot: bar chart of mentions per chapter, one facet per character
ggplot(
  character_data,
  aes(x = chapter, y = value, fill = as.factor(variable))
) +
  geom_bar(stat = "identity", width = 0.5) +
  facet_wrap(~variable, ncol = 1) +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  theme(legend.title = element_blank()) + # remove the legend title
  ggtitle("各章節中主要角色出現頻率") +
  xlab("章節") +
  ylab("出現次數")
# Build the LIWC sentiment dictionary: a data frame with the columns
# `word` and `sentiment` ("positive" / "negative").

# Positive word list (txt file)
P <- read_file("positive.txt")
# Negative word list (txt file)
N <- read_file("negative.txt")

# Split each string on "," (strsplit returns a list; take its first element).
# Then strip surrounding whitespace from every entry and drop empty entries:
# read_file() returns the raw file content, so a newline at the end of the
# file or padding after a comma would otherwise stay attached to a word and
# prevent it from ever matching a token in the join.
P <- trimws(strsplit(P, ",")[[1]])
N <- trimws(strsplit(N, ",")[[1]])
P <- P[P != ""]
N <- N[N != ""]

# stringsAsFactors = FALSE keeps `word` a character column on R < 4.0 as well,
# so it joins cleanly with the character `word` column of the tokens.
P <- data.frame(
  word = P,
  sentiment = rep("positive", length(P)),
  stringsAsFactors = FALSE
)
N <- data.frame(
  word = N,
  sentiment = rep("negative", length(N)),
  stringsAsFactors = FALSE
)
LIWC <- rbind(P, N)
# Join the tokens with the LIWC sentiment dictionary. The key is spelled out
# (`word` is the only column the two tables share) so the join does not rely
# on, or print a message about, the implicit key.
sentiment_data <- tokens %>% inner_join(LIWC, by = "word")

# Count positive / negative words per chapter using the LIWC dictionary.
# Lollipop plot
#
# - `fill = 0`: a chapter that has words of only one polarity would otherwise
#   get NA in the other column, and its segment / point would be dropped.
# - ungroup(): summarise() leaves the result grouped by chapter.
# - negative counts are negated so they extend to the opposite side of zero.
sentiment_count <- sentiment_data %>%
  group_by(chapter, sentiment) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  spread(key = sentiment, value = count, fill = 0) %>%
  mutate(negative = -negative)

# The global mapping no longer contains `y = count`: after spread() there is
# no `count` column, and every layer supplies its own y.
ggplot(sentiment_count, aes(x = chapter)) +
  geom_segment(
    aes(xend = chapter, y = positive, yend = negative),
    size = 1,
    color = "grey50"
  ) +
  geom_point(aes(y = positive), color = "#DE7E73", size = 2) + # red: positive sentiment
  geom_point(aes(y = negative), color = "#84B1ED", size = 2) + # blue: negative sentiment
  coord_flip() +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  ggtitle("各章節中正負面字詞出現次數") +
  xlab("章節") +
  ylab("出現次數")
# Bar plots: number of positive / negative words per chapter,
# one facet per sentiment
sentiment_count <- sentiment_data %>%
  group_by(chapter, sentiment) %>%
  summarise(count = n())

ggplot(sentiment_count, aes(x = chapter, y = count, fill = sentiment)) +
  geom_bar(stat = "identity", size = 1, width = 0.7) +
  facet_wrap(~sentiment, ncol = 1) +
  ggtitle("各章節中正負面字詞出現次數") +
  xlab("章節") +
  ylab("出現次數") +
  scale_x_continuous(breaks = seq(0, 100, by = 5))