組員:
B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄

主題:封神演義

載入package

require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(udpipe)
## Loading required package: udpipe
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(ramify)
## Loading required package: ramify
## 
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
## 
##     flatten
## The following object is masked from 'package:webshot':
## 
##     resize
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:graphics':
## 
##     clip
require(RColorBrewer)
## Loading required package: RColorBrewer
library(gutenbergr)
require(widyr)
## Loading required package: widyr
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20) 

從古騰堡下載“封神演義”

# 下載 "封神演義" ,並將text欄位為空的行清除,以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>% 
  filter(text != "") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

斷出章節

fengshen_text <- fengshen_text %>% 
  mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))

Jieba斷詞、停用字

jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word = "stop_words.txt")

fengshen_tokenizer <- function(t){
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
fengshen_tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer) %>%
  filter(nchar(word) > 1)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(chapter, word) %>%
  rename(count=n)

fengshen_same = c("子牙","姜子牙")
fengshen_tokens$word[which(fengshen_tokens$word %in%fengshen_same)] = "姜子牙"
fengshen_tokens
## # A tibble: 108,077 x 3
##    chapter word  count
##      <int> <chr> <int>
##  1       1 一人      1
##  2       1 一回      2
##  3       1 一指      1
##  4       1 一日      2
##  5       1 一時      2
##  6       1 一梁      1
##  7       1 一見      1
##  8       1 一道      1
##  9       1 一陣      2
## 10       1 一面      1
## # … with 108,067 more rows

匯入封神演義角色

people <- read.table("./people.traditional.txt",sep='\n',encoding='utf-8',fileEncoding='utf-8')
names(people)<-c("person")

每個章節出現的角色

chapter_person <- fengshen_tokens %>% filter(fengshen_tokens$word %in% people$person)%>% unique() 
names(chapter_person)<-c("chapter","person","count")
chapter_person <- chapter_person %>% select(chapter,person)
chapter_person
## # A tibble: 1,435 x 2
##    chapter person
##      <int> <chr> 
##  1       1 太師  
##  2       1 妲己  
##  3       1 武王  
##  4       1 殷洪  
##  5       1 殷郊  
##  6       1 紂王  
##  7       1 聞仲  
##  8       1 聞太師
##  9       1 鄂崇禹
## 10       1 黃飛虎
## # … with 1,425 more rows

每個章節出現的兩個角色

因為要節點是人物,edge是有共同出現在同一章

person_person <- inner_join(x = chapter_person, y = chapter_person, by = "chapter") %>% filter(person.x != person.y)
person_person
## # A tibble: 23,784 x 3
##    chapter person.x person.y
##      <int> <chr>    <chr>   
##  1       1 太師     妲己    
##  2       1 太師     武王    
##  3       1 太師     殷洪    
##  4       1 太師     殷郊    
##  5       1 太師     紂王    
##  6       1 太師     聞仲    
##  7       1 太師     聞太師  
##  8       1 太師     鄂崇禹  
##  9       1 太師     黃飛虎  
## 10       1 妲己     太師    
## # … with 23,774 more rows
link <- person_person %>% group_by(person.x,person.y) %>% summarise(count = n()) 

人物與人物的網路圖

Network <- graph_from_data_frame(d=link, directed=T)
Network
## IGRAPH 8dfb67f DN-- 101 5182 -- 
## + attr: name (v/c), count (e/n)
## + edges from 8dfb67f (vertex names):
##  [1] 丘引  ->哪吒     丘引  ->土行孫   丘引  ->多寶道人 丘引  ->太乙真人
##  [5] 丘引  ->太師     丘引  ->姜子牙   丘引  ->孫焰紅   丘引  ->廣成子  
##  [9] 丘引  ->張桂芳   丘引  ->文王     丘引  ->李靖     丘引  ->武吉    
## [13] 丘引  ->武王     丘引  ->火靈聖母 丘引  ->申公豹   丘引  ->紂王    
## [17] 丘引  ->聞仲     丘引  ->聞太師   丘引  ->通天教主 丘引  ->鄧九公  
## [21] 丘引  ->鄧嬋玉   丘引  ->金靈聖母 丘引  ->韋護     丘引  ->飛虎    
## [25] 丘引  ->高明     丘引  ->黃飛虎   伯邑考->元始天尊 伯邑考->妲己    
## [29] 伯邑考->姜子牙   伯邑考->崇黑虎   伯邑考->崔英     伯邑考->文王    
## + ... omitted several edges
# 畫出網路圖
plot(Network,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1,edge.arrow.size = 0.4)

設為無向圖

Network<-as.undirected(Network, mode = "collapse")
plot(Network,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1)

太多角色和連結了,只能看出有部分角色屬於中心角色與大部分人物都有連結,而在外圍的角色則與較靠近的節點為較常一起出現的。

找出中心角色

找出一起出現超過20章的人物

link_main <- person_person %>% group_by(person.x,person.y) %>% summarise(count = n()) 
link_main <- link_main %>% filter(count >=20)

人物與人物的網路圖

Network_main <- graph_from_data_frame(d=link_main, directed=T)
Network_main
## IGRAPH f526de7 DN-- 40 160 -- 
## + attr: name (v/c), count (e/n)
## + edges from f526de7 (vertex names):
##  [1] 元始天尊->姜子牙       南極仙翁->姜子牙       哪吒    ->土行孫      
##  [4] 哪吒    ->姜子牙       哪吒    ->木吒         哪吒    ->李靖        
##  [7] 哪吒    ->楊戩         哪吒    ->武吉         哪吒    ->武王        
## [10] 哪吒    ->申公豹       哪吒    ->紂王         哪吒    ->聞太師      
## [13] 哪吒    ->雷震子       哪吒    ->韋護         哪吒    ->黃天化      
## [16] 哪吒    ->黃飛虎       土行孫  ->哪吒         土行孫  ->姜子牙      
## [19] 土行孫  ->武王         太乙真人->姜子牙       太師    ->姜子牙      
## [22] 太師    ->聞太師       妲己    ->姜子牙       妲己    ->紂王        
## + ... omitted several edges
# 畫出網路圖
Network_main<-as.undirected(Network_main, mode = "collapse")
plot(Network_main,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1,edge.arrow.size = 0.4)

可以發現姜子牙為封神演義的中心角色,許多人物都與他會共同出現。而哪吒、武王也能算另外的中心人物。