組員:
B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄
require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(udpipe)
## Loading required package: udpipe
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
require(ramify)
## Loading required package: ramify
##
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
##
## flatten
## The following object is masked from 'package:webshot':
##
## resize
## The following object is masked from 'package:tidyr':
##
## fill
## The following object is masked from 'package:graphics':
##
## clip
require(RColorBrewer)
## Loading required package: RColorBrewer
library(gutenbergr)
require(widyr)
## Loading required package: widyr
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# 下載 "封神演義" ,並將text欄位為空的行清除,以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>%
filter(text != "") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
fengshen_text <- fengshen_text %>%
mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))
jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word = "stop_words.txt")
fengshen_tokenizer <- function(t){
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
fengshen_tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer) %>%
filter(nchar(word) > 1)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(chapter, word) %>%
rename(count=n)
fengshen_same = c("子牙","姜子牙")
fengshen_tokens$word[which(fengshen_tokens$word %in%fengshen_same)] = "姜子牙"
fengshen_tokens
## # A tibble: 108,077 x 3
## chapter word count
## <int> <chr> <int>
## 1 1 一人 1
## 2 1 一回 2
## 3 1 一指 1
## 4 1 一日 2
## 5 1 一時 2
## 6 1 一梁 1
## 7 1 一見 1
## 8 1 一道 1
## 9 1 一陣 2
## 10 1 一面 1
## # … with 108,067 more rows
people <- read.table("./people.traditional.txt",sep='\n',encoding='utf-8',fileEncoding='utf-8')
names(people)<-c("person")
chapter_person <- fengshen_tokens %>% filter(fengshen_tokens$word %in% people$person)%>% unique()
names(chapter_person)<-c("chapter","person","count")
chapter_person <- chapter_person %>% select(chapter,person)
chapter_person
## # A tibble: 1,435 x 2
## chapter person
## <int> <chr>
## 1 1 太師
## 2 1 妲己
## 3 1 武王
## 4 1 殷洪
## 5 1 殷郊
## 6 1 紂王
## 7 1 聞仲
## 8 1 聞太師
## 9 1 鄂崇禹
## 10 1 黃飛虎
## # … with 1,425 more rows
因為要節點是人物,edge是有共同出現在同一章
person_person <- inner_join(x = chapter_person, y = chapter_person, by = "chapter") %>% filter(person.x != person.y)
person_person
## # A tibble: 23,784 x 3
## chapter person.x person.y
## <int> <chr> <chr>
## 1 1 太師 妲己
## 2 1 太師 武王
## 3 1 太師 殷洪
## 4 1 太師 殷郊
## 5 1 太師 紂王
## 6 1 太師 聞仲
## 7 1 太師 聞太師
## 8 1 太師 鄂崇禹
## 9 1 太師 黃飛虎
## 10 1 妲己 太師
## # … with 23,774 more rows
link <- person_person %>% group_by(person.x,person.y) %>% summarise(count = n())
Network <- graph_from_data_frame(d=link, directed=T)
Network
## IGRAPH 8dfb67f DN-- 101 5182 --
## + attr: name (v/c), count (e/n)
## + edges from 8dfb67f (vertex names):
## [1] 丘引 ->哪吒 丘引 ->土行孫 丘引 ->多寶道人 丘引 ->太乙真人
## [5] 丘引 ->太師 丘引 ->姜子牙 丘引 ->孫焰紅 丘引 ->廣成子
## [9] 丘引 ->張桂芳 丘引 ->文王 丘引 ->李靖 丘引 ->武吉
## [13] 丘引 ->武王 丘引 ->火靈聖母 丘引 ->申公豹 丘引 ->紂王
## [17] 丘引 ->聞仲 丘引 ->聞太師 丘引 ->通天教主 丘引 ->鄧九公
## [21] 丘引 ->鄧嬋玉 丘引 ->金靈聖母 丘引 ->韋護 丘引 ->飛虎
## [25] 丘引 ->高明 丘引 ->黃飛虎 伯邑考->元始天尊 伯邑考->妲己
## [29] 伯邑考->姜子牙 伯邑考->崇黑虎 伯邑考->崔英 伯邑考->文王
## + ... omitted several edges
# 畫出網路圖
plot(Network,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1,edge.arrow.size = 0.4)
Network<-as.undirected(Network, mode = "collapse")
plot(Network,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1)
太多角色和連結了,只能看出有部分角色屬於中心角色與大部分人物都有連結,而在外圍的角色則與較靠近的節點為較常一起出現的。
找出一起出現超過20章的人物
link_main <- person_person %>% group_by(person.x,person.y) %>% summarise(count = n())
link_main <- link_main %>% filter(count >=20)
Network_main <- graph_from_data_frame(d=link_main, directed=T)
Network_main
## IGRAPH f526de7 DN-- 40 160 --
## + attr: name (v/c), count (e/n)
## + edges from f526de7 (vertex names):
## [1] 元始天尊->姜子牙 南極仙翁->姜子牙 哪吒 ->土行孫
## [4] 哪吒 ->姜子牙 哪吒 ->木吒 哪吒 ->李靖
## [7] 哪吒 ->楊戩 哪吒 ->武吉 哪吒 ->武王
## [10] 哪吒 ->申公豹 哪吒 ->紂王 哪吒 ->聞太師
## [13] 哪吒 ->雷震子 哪吒 ->韋護 哪吒 ->黃天化
## [16] 哪吒 ->黃飛虎 土行孫 ->哪吒 土行孫 ->姜子牙
## [19] 土行孫 ->武王 太乙真人->姜子牙 太師 ->姜子牙
## [22] 太師 ->聞太師 妲己 ->姜子牙 妲己 ->紂王
## + ... omitted several edges
# 畫出網路圖
Network_main<-as.undirected(Network_main, mode = "collapse")
plot(Network_main,vertex.size = 2,vertex.label.cex = .7,vertex.label.dist = 1,edge.arrow.size = 0.4)
可以發現姜子牙為封神演義的中心角色,許多人物都與他會共同出現。而哪吒、武王也能算另外的中心人物。