西遊記文本分析

##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  541544 29.0    1236600 66.1   621337 33.2
## Vcells 1019235  7.8    8388608 64.0  1600933 12.3

系統參數設定

# Switch to a Traditional Chinese locale so Chinese text is not garbled.
# Sys.setlocale() returns "" when the requested locale cannot be honoured, so
# try the POSIX name first and then fall back to the Windows names.
applied_locale <- ""
for (locale_name in c("zh_TW.UTF-8", "Chinese (Traditional)_Taiwan.950", "cht")) {
  applied_locale <- suppressWarnings(
    Sys.setlocale(category = "LC_ALL", locale = locale_name)
  )
  if (nzchar(applied_locale)) {
    break
  }
}
if (!nzchar(applied_locale)) {
  warning(
    "No Traditional Chinese locale is available; Chinese text may be garbled.",
    call. = FALSE
  )
}

## [1] ""

載入所需套件

# 載入library
library(gutenbergr)
library(jiebaR)
library(tidyverse)
library(stringr)
library(wordcloud2)
library(tidytext)
library(reshape2)

西遊記詞庫處理

# Lexicon source: https://shouji.sogou.com/dict.php?cid=34&page=4
# Packages used to prepare the lexicon.
library(readr)
library(devtools)
# cidian decodes Sogou .scel dictionaries. Install it from GitHub only when it
# is missing, so the script does not reinstall the package on every run.
if (!requireNamespace("cidian", quietly = TRUE)) {
  install_github("qinwf/cidian")
}
library(cidian)
# ropencc converts Simplified Chinese to Traditional Chinese.
if (!requireNamespace("ropencc", quietly = TRUE)) {
  install_github("qinwf/ropencc")
}
library(ropencc)

# Decode the Sogou .scel lexicon into a plain-text dictionary file.
scel_path <- "./Journey_to_the_West_lexicon.scel"
decoded_dict_path <- "./Journey_to_the_West_lexicon.scel_dict"
decode_scel(scel = scel_path, output = decoded_dict_path, cpp = TRUE)

# Preview the first 50 lines of the decoded dictionary.
# NOTE(review): scan() documents only "latin1" and "UTF-8" as recognised
# `encoding` values; confirm whether the lowercase value has any effect.
scan(
  file = decoded_dict_path,
  what = character(),
  nlines = 50,
  sep = "\n",
  encoding = "utf-8",
  fileEncoding = "utf-8"
)

## [1] "?"

# Read the decoded lexicon, convert it from Simplified to Traditional Chinese
# with the S2TW converter, and write the result to a separate dictionary file.
traditional_dict_path <- "./Journey_to_the_West_lexicon.traditional.dict"
dict <- read_file("./Journey_to_the_West_lexicon.scel_dict")
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, traditional_dict_path)

# Preview the first 50 lines of the converted dictionary.
scan(
  file = traditional_dict_path,
  what = character(),
  nlines = 50,
  sep = "\n",
  encoding = "utf-8",
  fileEncoding = "utf-8"
)

##  [1] "愛惜飛蛾紗照燈 n"   "愛惜飛蛾紗照燈回 n" "艾葉花皮豹子精 n"  
##  [4] "安樂值錢多 n"       "敖廣 n"             "敖欽 n"            
##  [7] "敖閏 n"             "敖順 n"             "灞波兒奔 n"        
## [10] "芭蕉扇 n"           "八戒 n"             "巴山虎 n"          
## [13] "白骨精 n"           "白龍馬 n"           "白鹿怪 n"          
## [16] "白麵狐狸 n"         "白無常 n"           "百眼魔君 n"        
## [19] "白衣秀士 n"         "白澤獅 n"           "搬運車遲 n"        
## [22] "寶象國捎書 n"       "背凡人重如丘山 n"   "北海龍王 n"        
## [25] "被魔化身 n"         "奔波兒灞 n"         "避不得醒 n"        
## [28] "避風如避箭 n"       "弼馬溫 n"           "避色如避仇 n"      
## [31] "必是是非人 n"       "辨認真邪 n"         "病不討醫 n"        
## [34] "不分男女 n"         "不教而善 n"         "不看僧面看佛面 n"  
## [37] "不冷不熱 n"         "不如本分為人 n"     "不受苦中苦 n"      
## [40] "不死帶傷 n"         "不信直中直 n"       "不醉即飽 n"        
## [43] "財者末也 n"         "蒼蠅包網兒 n"       "草木不生 n"        
## [46] "草木一秋 n"         "曾著賣糖君子哄 n"   "差之毫釐 n"        
## [49] "嫦娥 n"             "長他人之志氣 n"

網路西遊記詞庫的字詞不多，故自行建立西遊記相關詞彙

# Reference: https://kknews.cc/zh-tw/culture/9v3aej.html
# Hand-built list of Journey to the West names and terms that is added to the
# tokenizer's user dictionary.
# The entries "李耳", "楊戩" and "李靖" replace "李名耳", "楊名戩" and "李名靖",
# which look like extraction slips from "surname X, given name Y" phrasing and
# would never match the text.
# unique() drops the entries that are listed more than once.
self_lexicon <- unique(c(
  "唐僧", "三藏", "取經", "徒弟", "繡花針", "紅孩兒", "悟能", "豬八戒", "悟空", "悟淨",
  "觀世音菩薩", "觀世音", "白龍馬", "小白龍", "西海龍王", "佛祖", "菩提祖師",
  "筋斗雲", "美猴王", "玉皇大帝", "王母娘娘", "瑤池娘娘", "齊天大聖", "蟠桃",
  "太上老君", "李耳", "托塔天王", "十八羅漢", "金丹", "青牛精", "金鋼琢",
  "靈吉菩薩", "太白金星", "芭蕉扇", "羅剎女", "鐵扇公主", "南極壽星", "狐狸精",
  "白鹿精", "南極仙翁", "弼馬溫", "西天取經", "獅駝洞", "鎮元大仙", "鎮元子",
  "人參果", "東來佛祖", "彌勒佛", "彌勒菩薩", "釋迦牟尼", "未來佛", "二郎真君",
  "楊戩", "托塔天王", "李靖", "托塔李天王", "如來佛祖", "婆羅門教", "文殊菩薩",
  "五台山", "青毛獅", "蓮花台", "普賢菩薩", "峨眉山", "四海龍王", "海龍王",
  "敖廣", "南海龍王", "敖欽", "北海龍王", "敖順", "西海龍王", "敖閏", "陰曹閻王",
  "金箍棒", "閻羅", "李世民", "昴日星官", "哪吒", "敖摩昂", "太子", "鼉龍", "摩昂",
  "嫦娥", "后羿", "毗藍婆菩薩", "紫雲山千花洞", "昴日星官", "巨靈神", "木叉", "木吒",
  "熊羆怪", "黃風大聖", "黃風怪", "三股鋼叉", "黃風嶺", "南山大王", "黃眉大王",
  "六耳獼猴", "騰雲駕霧", "通天河魚怪", "紅孩兒", "聖嬰大王", "牛魔王", "善財童子",
  "獨角兕大王", "百眼魔君", "多眼怪", "黃花觀", "八戒", "沙僧", "沙悟淨", "白骨精",
  "白骨夫人", "妖精", "金角大王", "銀角大王", "七星寶劍", "紅葫蘆", "玉凈瓶",
  "金繩子", "獅魔王", "青毛獅子", "獅魔", "白鹿精", "蟠龍", "蠍子精", "黃袍怪",
  "奎木狼", "九頭怪", "獅駝洞老妖", "獅駝洞二怪", "獅駝洞三怪", "獅駝洞", "紅蟒精",
  "玄英洞", "辟寒大王", "辟暑大王", "辟塵大王", "盤絲洞", "蜘蛛精", "七仙姑",
  "玉兔精", "天竺國", "老鼠精", "半截觀音", "地涌夫人", "虎力大仙", "鹿力大仙",
  "羊力大仙", "齊天大聖", "西遊記", "唐三藏", "老孫", "美猴王", "孫行者", "孫大聖", "貧僧"
))

設定結巴斷詞，並載入西遊記的斷詞字典

# Build the jieba worker with the converted lexicon as the user dictionary and
# a stop-word list, then register the hand-built vocabulary as user words.
jieba_tokenizer <- worker(
  user = "Journey_to_the_West_lexicon.traditional.dict",
  stop_word = "stop_words.txt"
)
new_user_word(jieba_tokenizer, self_lexicon)

## [1] TRUE

從古騰堡計畫下載西遊記的文本資料

# Download Project Gutenberg book 23962, drop empty lines, and keep one row per
# distinct (gutenberg_id, text) pair.
# NOTE(review): distinct() also removes non-adjacent lines whose text repeats
# elsewhere in the book; confirm that this is intended.
raw_book <- gutenberg_download(23962)
West_Journey <- distinct(filter(raw_book, text != ""), gutenberg_id, text)

新增章節欄位

# Add a running chapter number: every line that matches the heading pattern
# starts a new chapter, and lines before the first heading get chapter 0.
# The heading of chapter 83 has a leading space in the source text, hence the
# second alternative "^ 第.*回".
# `text` is referenced through data masking rather than `West_Journey$text`,
# so the expression always reads the rows that are actually in the pipeline.
# NOTE(review): the pattern matches any line that starts with "第" and contains
# "回"; confirm that no body line is counted as a heading.
West_Journey <- West_Journey %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回|^ 第.*回"))))

設定斷詞函數，並將文本進行斷詞

# Tokenizer callback for unnest_tokens().
#
# t: vector of text lines to segment.
# Returns a list with one segment() result per element of `t`, produced with
# the global `jieba_tokenizer` worker. Single-character tokens are kept here;
# they are filtered out by the downstream analyses.
WestJourney_tokenizer <- function(t) {
  lapply(t, function(line) segment(line, jieba_tokenizer))
}

# Segment every text line into words: one output row per token, stored in the
# `word` column.
tokens <- unnest_tokens(
  West_Journey,
  word,
  text,
  token = WestJourney_tokenizer
)

目標一 :

計算詞彙的出現次數

# Keep only tokens longer than one character, count the occurrences of each
# word, and order the words from most to least frequent. No minimum-frequency
# cut-off is applied.
multi_char_tokens <- filter(tokens, nchar(word) > 1)
tokens_count <- multi_char_tokens %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum))

summary(tokens_count)

##      word                sum          
##  Length:46203       Min.   :   1.000  
##  Class :character   1st Qu.:   1.000  
##  Mode  :character   Median :   1.000  
##                     Mean   :   3.947  
##                     3rd Qu.:   2.000  
##                     Max.   :3931.000

文字雲

# Draw a word cloud from the word-frequency table `tokens_count`.
wordcloud2(tokens_count, size = 1)

統計出現次數大於500的詞彙

# Horizontal bar chart of the multi-character words that occur more than 500
# times, ordered by frequency.
frequent_words <- tokens %>%
  filter(nchar(word) > 1) %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n))

ggplot(frequent_words, aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL, y = "出現次數") +
  coord_flip()

目標二 :

各章節長度，以語句數來計算

# Line chart of chapter length, with one line for the number of text lines
# ("sentences") and one for the number of tokens ("words") per chapter.
# The constant `fill = "type"` mapping and the group_by() before ggplot() were
# removed: no layer uses a fill, and ggplot2 groups the lines by the colour
# aesthetic rather than by dplyr groups.
# NOTE(review): the title and y-axis label mention only sentences although word
# counts are plotted as well; confirm the intended wording.
plot <- bind_rows(
  West_Journey %>%
    group_by(chapter) %>%
    summarise(count = n(), type = "sentences"),
  tokens %>%
    group_by(chapter) %>%
    summarise(count = n(), type = "words")
) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line(size = 1) +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量")

plot

目標三 :

計算角色在各章節中出現的頻率

# Synonym (alias) lists for each character: every vector holds the names that
# are counted as a mention of that character.

# Sun Wukong
wukong <- c("老孫", "孫悟空","悟空","孫大聖","石猴","美猴王","弼馬溫","齊天大聖","行者","妖猴","心猿","大師兄","孫行者")
# Tang Sanzang
tangseng <- c("師父", "唐僧", "三藏", "唐三藏", "唐長老", "聖僧", "玄奘", "高僧","貧僧")
# Sha Wujing
wujing <- c("沙僧", "悟淨", "沙悟淨")
# Zhu Bajie
# NOTE(review): "豬剛" may be a truncation of "豬剛鬣" — confirm against the
# tokenizer output.
bajie <- c("八戒", "悟能", "豬剛", "老豬", "豬八戒")
# Demons and monsters ("妖精" is listed twice; harmless for %in% matching)
evil <- c("白骨精","白骨夫人","妖精","金角大王","銀角大王","獅魔王","青毛獅子","獅魔","白鹿精","蟠龍","蠍子精",
          "黃袍怪","奎木狼","九頭怪","獅駝洞老妖","獅駝洞二怪","獅駝洞三怪","紅蟒精","辟寒大王","辟暑大王",
          "辟塵大王","蜘蛛精","玉兔精","老鼠精","半截觀音","地涌夫人","虎力大仙","虎力","鹿力大仙","鹿力",
          "羊力大仙","羊力","羅剎女","鐵扇公主","牛魔王","妖精","邪怪")

# Count, per chapter, how many tokens match any alias of one character.
#
# token_table:  data frame of tokens with `chapter` and `word` columns.
# aliases:      character vector of names that refer to the character.
# column_label: name of the count column in the returned table.
#
# Returns one row per chapter in which at least one alias occurs; chapters
# without a match are absent from the result.
count_character_mentions <- function(token_table, aliases, column_label) {
  token_table %>%
    filter(nchar(word) > 1, word %in% aliases) %>%
    group_by(chapter) %>%
    summarise(!!column_label := n())
}

# Per-chapter mention counts for each main character and for the demons.
wukong_num <- count_character_mentions(tokens, wukong, "孫悟空")
tangseng_num <- count_character_mentions(tokens, tangseng, "唐三藏")
wujing_num <- count_character_mentions(tokens, wujing, "沙悟淨")
bajie_num <- count_character_mentions(tokens, bajie, "豬八戒")
evil_num <- count_character_mentions(tokens, evil, "妖精們")

# Merge the per-character counts onto the full list of chapters, replace the
# missing counts with 0, and reshape to long format (one row per chapter and
# character), ordered by chapter.
character_data <- Reduce(
  function(joined, counts) left_join(joined, counts, by = c("chapter")),
  list(wukong_num, tangseng_num, wujing_num, bajie_num, evil_num),
  West_Journey %>% distinct(chapter)
)
character_data[is.na(character_data)] <- 0
character_data <- arrange(melt(character_data, id.vars = "chapter"), chapter)

# Bar chart of mentions per chapter, with one facet row per character.
ggplot(
  character_data,
  aes(x = chapter, y = value, fill = as.factor(variable))
) +
  geom_col(width = 0.5) +
  facet_wrap(~variable, ncol = 1) +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(title = "各章節中主要角色出現頻率", x = "章節", y = "出現次數") +
  theme(legend.title = element_blank()) # remove the legend title

目標四 :

分析各章節情緒，以正負面字詞判斷

讀取LIWC字典

# Read one comma-separated sentiment word list and label every word.
#
# path:      text file that holds the words separated by ",".
# sentiment: label stored in the `sentiment` column ("positive"/"negative").
#
# Returns a data frame with character columns `word` and `sentiment`.
# Surrounding whitespace, such as a trailing newline at the end of the file, is
# trimmed and empty entries are dropped, so every word can match a token.
read_sentiment_words <- function(path, sentiment) {
  # strsplit() returns a list; the single input string is its first element.
  words <- trimws(strsplit(read_file(path), ",")[[1]])
  words <- words[nzchar(words)]
  data.frame(
    word = words,
    sentiment = rep(sentiment, length(words)),
    stringsAsFactors = FALSE
  )
}

# Positive word list
P <- read_sentiment_words("positive.txt", "positive")
# Negative word list
N <- read_sentiment_words("negative.txt", "negative")

# Combined sentiment dictionary with the columns `word` and `sentiment`.
LIWC <- rbind(P, N)

判斷文集中的詞彙在LIWC字典中屬於positive還是negative

# Join the tokens with the LIWC sentiment dictionary on the shared `word`
# column; tokens that are not in the dictionary are dropped.
sentiment_data <- tokens %>% inner_join(LIWC, by = "word")

# Count positive and negative words per chapter and spread them into one
# column per sentiment. A chapter that lacks one sentiment gets 0 instead of
# NA, and negative counts are negated so they extend in the opposite direction.
# Lollipop plot
sentiment_count <- sentiment_data %>%
  group_by(chapter, sentiment) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  spread(key = sentiment, value = count, fill = 0L) %>%
  mutate(negative = -negative)

# Only `x` is mapped globally: the `count` column no longer exists after
# spread(), and every layer supplies its own `y`.
ggplot(sentiment_count, aes(x = chapter)) +
  geom_segment(
    aes(xend = chapter, y = positive, yend = negative),
    size = 1, color = "grey50"
  ) +
  geom_point(aes(y = positive), color = "#DE7E73", size = 2) + # red: positive
  geom_point(aes(y = negative), color = "#84B1ED", size = 2) + # blue: negative
  coord_flip() +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  ggtitle("各章節中正負面字詞出現次數") +
  xlab("章節") +
  ylab("出現次數")

# Bar plots: number of sentiment words per chapter, with one facet row per
# sentiment.
sentiment_count <- summarise(
  group_by(sentiment_data, chapter, sentiment),
  count = n()
)

ggplot(sentiment_count, aes(x = chapter, y = count, fill = sentiment)) +
  geom_col(size = 1, width = 0.7) +
  facet_wrap(~sentiment, ncol = 1) +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(title = "各章節中正負面字詞出現次數", x = "章節", y = "出現次數")

西遊記文本分析

社群媒體分析-第六組

2020/3/31

系統參數設定

載入所需套件

西遊記詞庫處理

網路西遊記詞庫的字詞不多，故自行建立西遊記相關詞彙

設定結巴斷詞，並載入西遊記的斷詞字典

從古騰堡計畫下載西遊記的文本資料

新增章節欄位

設定斷詞函數，並將文本進行斷詞

目標一 :

計算詞彙的出現次數

文字雲

統計出現次數大於500的詞彙

目標二 :

各章節長度，以語句數來計算

目標三 :

計算角色在各章節中出現的頻率

目標四 :

分析各章節情緒，以正負面字詞判斷

讀取LIWC字典

判斷文集中的詞彙在LIWC字典中屬於positive還是negative