First, load the required libraries.

library(gutenbergr)
library(dplyr)
library(tidytext)
library(jiebaR)
library(stringr)
library(ggplot2)
library(tidyr)
library(scales)
library(igraph)
library(reshape2)

Download Journey to the West from Project Gutenberg, initialize the jieba engine with the west.traditional dictionary and the stop_words list, and segment the text into tokens.

# download book
book <- gutenberg_download(23962)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# drop empty and duplicate lines
book <- book %>% filter(text!="") %>% distinct(gutenberg_id, text)
# add chapter number (increments at every chapter heading such as 「第...回」)
book <- book %>% mutate(chapter = cumsum(str_detect(text, regex("^.*第.*回( |$)"))))
# jieba worker with the Journey to the West user dictionary and a stop-word list
jieba_tokenizer <- worker(user="west.traditional.txt", stop_word = "stop_words.txt")
# tokenizer function for unnest_tokens
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens <- book %>% unnest_tokens(word, text, token=book_tokenizer)
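
As a quick sanity check on the segmentation, the most frequent tokens can be counted; a short sketch using the tokens data frame created above:

# ten most frequent tokens after segmentation and stop-word removal
tokens %>%
  count(word, sort = TRUE) %>%
  head(10)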

Pick out a set of specific demons and characters from Journey to the West, store them in the character vector, and draw a relationship graph of these characters.

character <- c('野牛精','熊羆精','老虎精','腳力龍馬','黑熊怪','蒼狼怪','白花蛇怪','豬八戒',
             '黃風怪','沙和尚','白骨夫人','黃袍怪','奎星','金角大王','銀角大王','青毛獅子怪',
             '聖嬰大王紅孩兒','鼉龍','虎力大仙','鹿力大仙','羊力大仙','靈感大王','獨角兕大王',
             '如意真仙','風月魔蠍子精','六耳獼猴','鐵扇仙羅剎女','玉面公主','狸精','牛魔王',
             '萬聖龍王','九頭駙馬','九頭蟲','柏樹','檜樹','竹竿','鬆樹','楓樹','杏樹','丹桂',
             '臘梅','黃眉大王','紅鱗大蟒','賽太歲','蜘蛛精','蜈蚣精','青毛獅子怪','黃牙老象',
             '大鵬金翅鵰','白鹿精怪','狐狸精','金鼻白毛老鼠精','艾葉花皮豹子精','鐵背蒼狼怪',
             '九靈元聖','九頭獅子','黃獅精','狻猊獅','搏象獅','白澤獅','伏狸獅','猱獅','雪獅',
             '闢寒大王','闢暑大王','闢塵大王','犀牛精','玉兔','老黿','行者','悟空','孫大聖','大聖',
             '老孫','八戒','悟淨','沙僧','三藏','唐僧','唐三藏','三藏','師父','菩薩')

# keep character word
character_token <- tokens[tokens$word %in% character, ]
# keep each character at most once per chapter
character_token <- character_token %>% unique()
# build the character co-occurrence matrix (number of shared chapters)
occurrence <- crossprod(table(character_token$chapter, character_token$word))
diag(occurrence) <- 0
# melt the co-occurrence matrix into a long data frame
graph_df <- melt(occurrence)
graph_df <- graph_df[graph_df$value!=0,]
# create network object
network <- graph_from_data_frame(d=graph_df,  directed=F)
# scale node size by degree
V(network)$size <- degree(network)*0.1
# plot it
plot(network)
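
The key step above is crossprod(table(chapter, word)): the table is a chapter-by-character incidence matrix (0/1 after the unique() call), and its cross-product counts, for every pair of characters, the number of chapters in which both appear, which is also why the diagonal (each character with itself) is zeroed out afterwards. A minimal toy example with made-up data:

# toy data: A and B share chapters 1 and 2, C appears only in chapter 2
toy <- data.frame(chapter = c(1, 1, 2, 2, 2),
                  word    = c("A", "B", "A", "B", "C"))
incidence <- table(toy$chapter, toy$word)   # chapters x characters
crossprod(incidence)                        # A-B = 2, A-C = 1, B-C = 1; diagonal = chapter counts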

The graph is a bit cluttered, because the same character sometimes goes by several names; 唐三藏, for example, appears as 「唐三藏」, 「三藏」, 「師父」, and so on. So we first pick a few main characters and unify their aliases by enumerating the names and replacing them one by one.

# unify aliases of the same character
character_token$word[character_token$word=='豬八戒'] <- '八戒'

character_token$word[character_token$word=='沙和尚'] <- '悟淨'
character_token$word[character_token$word=='悟凈'] <- '悟淨'
character_token$word[character_token$word=='沙僧'] <- '悟淨'

character_token$word[character_token$word=='孫大聖'] <- '悟空'
character_token$word[character_token$word=='大聖'] <- '悟空'
character_token$word[character_token$word=='老孫'] <- '悟空'
character_token$word[character_token$word=='行者'] <- '悟空'

character_token$word[character_token$word=='唐三藏'] <- '三藏'
character_token$word[character_token$word=='唐僧'] <- '三藏'
character_token$word[character_token$word=='師父'] <- '三藏'

occurrence <- crossprod(table(character_token$chapter, character_token$word))
diag(occurrence) <- 0

graph_df <- melt(occurrence)
graph_df <- graph_df[graph_df$value!=0,]
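
The chained replacements above do the job; as an aside, the same alias table can also be written as a single call with dplyr::recode, a compact equivalent rather than the post's original approach:

# unify aliases in one call: the argument name is the old value, the value is the replacement
character_token$word <- recode(character_token$word,
                               '豬八戒' = '八戒',
                               '沙和尚' = '悟淨', '悟凈' = '悟淨', '沙僧' = '悟淨',
                               '孫大聖' = '悟空', '大聖' = '悟空', '老孫' = '悟空', '行者' = '悟空',
                               '唐三藏' = '三藏', '唐僧' = '三藏', '師父' = '三藏')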

Next, we try trimming the cast down by keeping only pairs of characters that co-occur more than 10 times. The size of a character's yellow circle reflects how many times that character appears, and the more often two characters appear in the same chapter, the thicker the line between them.

# drop character pairs that co-occur 10 times or fewer
graph_df <- graph_df[graph_df$value>10,]
colnames(graph_df) <- c('from', 'to', 'weight')

# appearance counts (in the raw tokens) for each character kept in the graph
character_sum <- tokens[tokens$word %in% unique(c(as.character(graph_df$from), as.character(graph_df$to))),] %>% 
  group_by(word) %>% summarise(sum=n())


network <- graph_from_data_frame(d = graph_df, vertices = character_sum, directed=F)

# node size scales with the log of each character's appearance count
V(network)$size <- log(V(network)$sum)+3
# edge width scales with the log of the co-occurrence count
E(network)$width <- log(E(network)$weight)-2
plot(network)
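
plot() on an igraph object uses a randomized layout, so the picture shifts between runs; fixing the seed and the layout algorithm is a small optional tweak that makes the figure reproducible:

# fix the seed and use a Fruchterman-Reingold layout for a reproducible plot
set.seed(2019)
plot(network, layout = layout_with_fr(network))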

The graph shows how often the characters appear together. To some extent this reflects how tightly the members of the protagonist group are bound to one another, or how often they cross paths with each monster. The yellow nodes at the center, from which the other characters fan out, are the core figures of the story.

(1) Center of the graph: 菩薩, 三藏, 八戒, 悟空, 悟淨. The protagonists 唐三藏, 孫悟空, 八戒, and 悟淨 all sit at the center of the graph, with strong connections to every other character. 觀世音菩薩, who often shows up to help the pilgrims and appears in many chapters, is also one of the central hubs.

(2) Middle layer: 牛魔王, 丹桂, 黃風怪. The second layer holds the demons that are next most closely tied to the main characters, including well-known figures such as 牛魔王, 丹桂, and 黃風怪.

(3) Outermost layer: the three Great Immortals and other characters. The outermost layer consists of monsters that are more distant from the main characters but are still mentioned together with them many times, including the three Great Immortals 羊力, 虎力, and 鹿力, who fight the pilgrims in the Cart-Slow Kingdom chapters over the clash between Buddhism and Taoism; 靈感大王, who feeds on boys and girls; and celestial mounts such as 雪獅 and 賽太歲.
(3)最外層:三位大仙、其他角色 再最外層則是更疏離主角群,但同樣被多次一同提及的怪物 包括車遲國篇章中,因佛道教衝突打起來的的羊力、虎力、鹿力等三位大仙 嗜吃童男童女的靈感大王和雪獅、賽太歲等神仙坐騎等