組員: B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄
require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(udpipe)
## Loading required package: udpipe
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
require(ramify)
## Loading required package: ramify
##
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
##
## flatten
## The following object is masked from 'package:webshot':
##
## resize
## The following object is masked from 'package:tidyr':
##
## fill
## The following object is masked from 'package:graphics':
##
## clip
require(RColorBrewer)
## Loading required package: RColorBrewer
library(gutenbergr)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# 下載 "封神演義" ,並將text欄位為空的行清除,以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>%
filter(text != "") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
fengshen_text <- fengshen_text %>%
mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))
jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word = "stop_words.txt")
fengshen_tokenizer <- function(t){
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
fengshen_tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer) %>%
filter(nchar(word) > 1)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(chapter, word) %>%
rename(count=n)
fengshen_tokens
## # A tibble: 108,077 x 3
## chapter word count
## <int> <chr> <int>
## 1 1 一人 1
## 2 1 一日 2
## 3 1 一回 2
## 4 1 一見 1
## 5 1 一指 1
## 6 1 一面 1
## 7 1 一首 4
## 8 1 一時 2
## 9 1 一陣 2
## 10 1 一梁 1
## # ... with 108,067 more rows
library(topicmodels)
fengshen_dtm <-fengshen_tokens %>% cast_dtm(chapter, word, count)
fengshen_dtm
## <<DocumentTermMatrix (documents: 100, terms: 38287)>>
## Non-/sparse entries: 108077/3720623
## Sparsity : 97%
## Maximal term length: 8
## Weighting : term frequency (tf)
inspect(fengshen_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 39/61
## Sparsity : 61%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一人 一日 一回 一見 一指 一面 一首 一時 一陣 一梁
## 1 1 2 2 1 1 1 4 2 2 1
## 10 0 4 0 0 0 0 0 1 0 0
## 2 3 2 0 2 0 0 0 1 1 0
## 3 2 0 0 2 0 0 0 3 0 0
## 4 0 1 0 0 0 0 0 1 0 0
## 5 0 0 0 0 0 0 0 2 0 0
## 6 0 3 0 0 0 0 1 5 0 0
## 7 4 2 3 1 0 0 0 0 0 0
## 8 0 1 0 1 0 0 0 2 0 0
## 9 2 3 0 2 0 2 0 1 4 0
(2,5,10,15,20)個
# ldas = c()
# topics = c(2,5,10,15,20)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(fengshen_dtm, k = topic, control = list(seed = 2020))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result")
# }
load("ldas_result")
topics = c(2,5,10,15,20)
data_frame(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
產生create LDAvis所需的json function
(上傳github過後依舊跑出空白的網頁)
# topicmodels_json_ldavis <- function(fitted, doc_term){
# require(LDAvis)
# require(slam)
# #
# # ###以下function 用來解決,主題數多會出現NA的問題
# # ## 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
# ls_LDA = function (phi)
# {
# jensenShannon <- function(x, y) {
# m <- 0.5 * (x + y)
# lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
# rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
# 0.5 * sum(lhs) + 0.5 * sum(rhs)
# }
# dist.mat <- proxy::dist(x = phi, method = jensenShannon)
# pca.fit <- stats::cmdscale(dist.mat, k = 2)
# data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
# }
#
# # Find required quantities
# phi <- as.matrix(posterior(fitted)$terms)
# theta <- as.matrix(posterior(fitted)$topics)
# vocab <- colnames(phi)
# term_freq <- slam::col_sums(doc_term)
#
# # Convert to json
# json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
# vocab = vocab,
# doc.length = as.vector(table(doc_term$i)),
# term.frequency = term_freq, mds.method = ls_LDA)
#
# return(json_lda)
# }
# 設置alpha及delta參數
# devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
# ####### 以下用來產生ldavis的檔案,可以之後用來在local端、放在網路上打開 ##########
# for(lda in ldas){
# k = lda@k ## lda 主題數
# if(k==2){next}
# json_res <- topicmodels_json_ldavis(lda,fengshen_dtm)
# # serVis(json_res,open.browser = T)
# lda_dir = paste0(k,"_ldavis")
# if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
# serVis(json_res, out.dir =lda_dir, open.browser = F)
# writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
# }
#
# topic_10 = ldas[[3]]
# json_res <- topicmodels_json_ldavis(topic_10,fengshen_dtm)
#
#
#
# serVis(json_res,open.browser = T)
# #
# # # 如果無法開啟視窗(windows用戶)可執行這段
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))
fengshen_lda = ldas[[3]]
topics <- tidy(fengshen_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics
## # A tibble: 382,870 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 一人 0.000468
## 2 2 一人 0.000324
## 3 3 一人 0.000483
## 4 4 一人 0.000495
## 5 5 一人 0.000379
## 6 6 一人 0.000121
## 7 7 一人 0.000358
## 8 8 一人 0.000553
## 9 9 一人 0.000144
## 10 10 一人 0.000485
## # ... with 382,860 more rows
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
從出現的人物推測主題
topic_name = c('梅山七怪','殷郊與殷洪','蘇護','哪吒出世','聞太師','比干','十絕陣、趙公明','蘇妲己進宮','通天教主','姜子牙設計收鄧九公')
例如:
主題一出現楊戩、張奎、土行孫、元洪
主題四出現李靖、哪吒
theta矩陣
fen_documents <- tidy(fengshen_lda, matrix = "gamma")
fen_documents
## # A tibble: 1,000 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 1 1 0.0000103
## 2 2 1 0.00000504
## 3 3 1 0.00000604
## 4 4 1 0.0000107
## 5 5 1 0.0000110
## 6 6 1 0.00000625
## 7 7 1 0.00000603
## 8 8 1 0.00000466
## 9 9 1 0.00000671
## 10 10 1 0.00000815
## # ... with 990 more rows
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(fengshen_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 100 10
# get document topic proportions
document_topics <- doc_pro[fengshen_text$chapter,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
fengshen_topic = cbind(fengshen_text,document_topics_df)
fengshen_topic %>% head(10)
## gutenberg_id
## 1 23910
## 2 23910
## 3 23910
## 4 23910
## 5 23910
## 6 23910
## 7 23910
## 8 23910
## 9 23910
## 10 23910
## text chapter
## 1 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香 1
## 2 古風一首: 1
## 3 混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。 1
## 4 燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。 1
## 5 少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛, 1
## 6 日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。 1
## 7 三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言, 1
## 8 穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天, 1
## 9 直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏, 1
## 10 郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。 1
## 梅山七怪 殷郊與殷洪 蘇護 哪吒出世 聞太師
## 1 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 2 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 3 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 4 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 5 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 6 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 7 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 8 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 9 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 10 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 比干 十絕陣、趙公明 蘇妲己進宮 通天教主 姜子牙設計收鄧九公
## 1 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 2 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 3 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 4 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 5 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 6 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 7 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 8 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 9 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
## 10 1.029237e-05 1.029237e-05 0.9999074 1.029237e-05 1.029237e-05
可以看到第一回每個主題的比例分布,其中以“蘇妲己進宮”比例最高
fengshen_topic[,c(4:13)] =sapply(fengshen_topic[,c(4:13)] , as.numeric)
fengshen_topic %>%
select(-gutenberg_id) %>%
group_by(chapter) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "chapter")%>%
ggplot( aes(x=chapter, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
fengshen_topic[,c(4:13)] =sapply(fengshen_topic[,c(4:13)] , as.numeric)
fengshen_topic %>%
select(-gutenberg_id) %>%
group_by(chapter) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "chapter")%>%
group_by(chapter)%>%
mutate(total_value =sum(value))%>%
ggplot(aes(x=chapter, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") +
ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
可以發現封神演義的主題分布多為一章節一個主題,只有少數章節會出現兩個主題以上。