Group members: B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄

Loading packages

require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(udpipe)
## Loading required package: udpipe
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(ramify)
## Loading required package: ramify
## 
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
## 
##     flatten
## The following object is masked from 'package:webshot':
## 
##     resize
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:graphics':
## 
##     clip
require(RColorBrewer)
## Loading required package: RColorBrewer
library(gutenbergr)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20) 

Downloading 封神演義 from Project Gutenberg

# Download "封神演義" (gutenberg_id 23910), remove rows whose text field is empty, and drop duplicated lines
fengshen_text <- gutenberg_download(23910) %>% 
  filter(text != "") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

Splitting out the chapters

fengshen_text <- fengshen_text %>% 
  mutate(chapter = cumsum(str_detect(text, regex("第.*回(\u00A0|$)"))))
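
A quick sanity check: the regex should have detected 100 chapter headings (封神演義 has 100 回), matching the 100 documents in the DTM built below.

# Number of chapter headings matched by the 「第…回」 regex; expected to be 100
max(fengshen_text$chapter)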

Jieba tokenization and stop-word removal

jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word = "stop_words.txt")

fengshen_tokenizer <- function(t){
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
fengshen_tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer) %>%
  filter(nchar(word) > 1)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(chapter, word) %>%
  rename(count=n)

fengshen_tokens
## # A tibble: 108,077 x 3
##    chapter word  count
##      <int> <chr> <int>
##  1       1 一人      1
##  2       1 一日      2
##  3       1 一回      2
##  4       1 一見      1
##  5       1 一指      1
##  6       1 一面      1
##  7       1 一首      4
##  8       1 一時      2
##  9       1 一陣      2
## 10       1 一梁      1
## # ... with 108,067 more rows
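
Before casting to a DTM, it can help to glance at the highest-frequency tokens over the whole novel to verify the segmentation and stop-word removal (a minimal sketch using the fengshen_tokens table above):

# Ten most frequent tokens across all chapters
fengshen_tokens %>%
  group_by(word) %>%
  summarise(total = sum(count)) %>%
  arrange(desc(total)) %>%
  head(10)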

Converting the data into a Document-Term Matrix (DTM)

library(topicmodels)
fengshen_dtm <-fengshen_tokens %>% cast_dtm(chapter, word, count)
fengshen_dtm
## <<DocumentTermMatrix (documents: 100, terms: 38287)>>
## Non-/sparse entries: 108077/3720623
## Sparsity           : 97%
## Maximal term length: 8
## Weighting          : term frequency (tf)
inspect(fengshen_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 39/61
## Sparsity           : 61%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 一人 一日 一回 一見 一指 一面 一首 一時 一陣 一梁
##   1     1    2    2    1    1    1    4    2    2    1
##   10    0    4    0    0    0    0    0    1    0    0
##   2     3    2    0    2    0    0    0    1    1    0
##   3     2    0    0    2    0    0    0    3    0    0
##   4     0    1    0    0    0    0    0    1    0    0
##   5     0    0    0    0    0    0    0    2    0    0
##   6     0    3    0    0    0    0    1    5    0    0
##   7     4    2    3    1    0    0    0    0    0    0
##   8     0    1    0    1    0    0    0    2    0    0
##   9     2    3    0    2    0    2    0    1    4    0

Building the topic models

Fit LDA models with k = 2, 5, 10, 15, and 20 topics; the loop is commented out here because the fitted models were saved to "ldas_result" and are loaded in the next step.

# ldas = c()
# topics = c(2, 5, 10, 15, 20)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(fengshen_dtm, k = topic, control = list(seed = 2020))
#   ldas <- c(ldas, lda)
#   print(paste(topic, "topic(s), elapsed time:", Sys.time() - start_time))
#   save(ldas, file = "ldas_result")
# }

Loading the saved models

load("ldas_result")

Finding the optimal number of topics via perplexity

topics = c(2,5,10,15,20)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

LDAvis

Function that creates the JSON needed by LDAvis
(even after uploading to GitHub, the resulting page still renders blank)

# topicmodels_json_ldavis <- function(fitted, doc_term){
#     require(LDAvis)
#     require(slam)
# #   
# #    ### The function below works around NA values that can appear when the number of topics is large
# #    ## Reference: https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
# ls_LDA = function (phi)
# {
#   jensenShannon <- function(x, y) {
#       m <- 0.5 * (x + y)
#     lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
#     rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
#     0.5 * sum(lhs) + 0.5 * sum(rhs)
#   }
#   dist.mat <- proxy::dist(x = phi, method = jensenShannon)
#   pca.fit <- stats::cmdscale(dist.mat, k = 2)
#   data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
# }
# 
#     # Find required quantities
#     phi <- as.matrix(posterior(fitted)$terms)
#     theta <- as.matrix(posterior(fitted)$topics)
#     vocab <- colnames(phi)
#     term_freq <- slam::col_sums(doc_term)
# 
#     # Convert to json
#     json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
#                             vocab = vocab,
#                             doc.length = as.vector(table(doc_term$i)),
#                             term.frequency = term_freq, mds.method = ls_LDA)
# 
#     return(json_lda)
# }
# Setting the alpha and delta parameters (Gibbs sampler example)
# devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta = 0.1))

# ####### The loop below generates the LDAvis output files, which can later be opened locally or hosted online ##########
# for(lda in ldas){
#   k = lda@k ## lda 主題數
#   if(k==2){next}
#   json_res <- topicmodels_json_ldavis(lda,fengshen_dtm)
#   # serVis(json_res,open.browser = T)
#   lda_dir =  paste0(k,"_ldavis")
#   if(!dir.exists(lda_dir)){ dir.create(lda_dir) }  # dir.create() takes the directory path as its first argument
#   serVis(json_res, out.dir =lda_dir, open.browser = F)
#   # writeLines without a destination only prints to the console; write the re-encoded UTF-8 json back to the file
#   writeLines(iconv(readLines(paste0(lda_dir, "/lda.json")), to = "UTF-8"), paste0(lda_dir, "/lda.json"))
# }
# 
# topic_10 = ldas[[3]]
# json_res <- topicmodels_json_ldavis(topic_10,fengshen_dtm)
# 
# 
# 
# serVis(json_res,open.browser = T)
# # 
# # # If the browser window does not open (Windows users), run this instead
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF-8"), "./vis/lda.json")  # re-encode the json as UTF-8 and write it back

Selecting the 10-topic result

fengshen_lda = ldas[[3]] 
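
Since the models were saved in the order k = 2, 5, 10, 15, 20, ldas[[3]] should be the 10-topic model; a quick check of its k slot:

# Confirm the number of topics of the selected model; should print 10
fengshen_lda@k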

topics <- tidy(fengshen_lda, matrix = "beta") # Note: pass matrix = "beta" to tidy() to extract the phi matrix.
topics
## # A tibble: 382,870 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 一人  0.000468
##  2     2 一人  0.000324
##  3     3 一人  0.000483
##  4     4 一人  0.000495
##  5     5 一人  0.000379
##  6     6 一人  0.000121
##  7     7 一人  0.000358
##  8     8 一人  0.000553
##  9     9 一人  0.000144
## 10    10 一人  0.000485
## # ... with 382,860 more rows

Finding each topic's representative terms

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%  # order terms within each facet; pairs with scale_x_reordered()
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Naming the topics

Infer each topic from the characters that appear in it

topic_name = c('梅山七怪','殷郊與殷洪','蘇護','哪吒出世','聞太師','比干','十絕陣、趙公明','蘇妲己進宮','通天教主','姜子牙設計收鄧九公')

For example:
Topic 1 features 楊戩, 張奎, 土行孫, and 元洪
Topic 4 features 李靖 and 哪吒
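
To double-check a label, the highest-beta terms of a single topic can be listed from the top_terms table computed above, e.g. for topic 1 (梅山七怪):

# Top terms of topic 1, used to justify the 梅山七怪 label
top_terms %>%
  filter(topic == 1) %>%
  arrange(desc(beta))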

Document-topic probabilities

The theta matrix

fen_documents <- tidy(fengshen_lda, matrix = "gamma")
fen_documents
## # A tibble: 1,000 x 3
##    document topic      gamma
##    <chr>    <int>      <dbl>
##  1 1            1 0.0000103 
##  2 2            1 0.00000504
##  3 3            1 0.00000604
##  4 4            1 0.0000107 
##  5 5            1 0.0000110 
##  6 6            1 0.00000625
##  7 7            1 0.00000603
##  8 8            1 0.00000466
##  9 9            1 0.00000671
## 10 10           1 0.00000815
## # ... with 990 more rows

Per-document topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(fengshen_lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics
## [1] 100  10

cbind-ing the per-document topic distributions onto the text

# get document topic proportions 
document_topics <- doc_pro[fengshen_text$chapter,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
fengshen_topic = cbind(fengshen_text,document_topics_df)
fengshen_topic %>% head(10)
##    gutenberg_id
## 1         23910
## 2         23910
## 3         23910
## 4         23910
## 5         23910
## 6         23910
## 7         23910
## 8         23910
## 9         23910
## 10        23910
##                                                                    text chapter
## 1                  第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香       1
## 2                                                            古風一首:       1
## 3    混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。       1
## 4    燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。       1
## 5    少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛,       1
## 6    日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。       1
## 7    三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言,       1
## 8    穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天,       1
## 9    直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏,       1
## 10   郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。       1
##        梅山七怪   殷郊與殷洪         蘇護     哪吒出世       聞太師
## 1  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 2  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 3  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 4  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 5  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 6  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 7  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 8  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 9  1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
## 10 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05 1.029237e-05
##            比干 十絕陣、趙公明 蘇妲己進宮     通天教主 姜子牙設計收鄧九公
## 1  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 2  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 3  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 4  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 5  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 6  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 7  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 8  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 9  1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05
## 10 1.029237e-05   1.029237e-05  0.9999074 1.029237e-05       1.029237e-05

Here we can see the topic proportions for Chapter 1, with 蘇妲己進宮 having by far the highest proportion.
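
The same information can be condensed to one dominant topic per chapter (a minimal sketch based on the gamma table fen_documents and the topic_name vector above):

# For every chapter keep the topic with the largest gamma and attach its name
fen_documents %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  mutate(topic_label = topic_name[topic]) %>%
  arrange(as.numeric(document)) %>%
  head(10)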

Topic composition across chapters

fengshen_topic[,c(4:13)] =sapply(fengshen_topic[,c(4:13)] , as.numeric)
fengshen_topic %>% 
   select(-gutenberg_id) %>% 
  group_by(chapter) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "chapter")%>%
 ggplot( aes(x=chapter, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Topic proportions across chapters

fengshen_topic[,c(4:13)] =sapply(fengshen_topic[,c(4:13)] , as.numeric)
fengshen_topic %>%
  select(-gutenberg_id) %>% 
  group_by(chapter) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "chapter")%>%
  group_by(chapter)%>%
  mutate(total_value =sum(value))%>%
  ggplot(aes(x=chapter, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + 
  ylab("proportion") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

We find that in 封神演義 most chapters are dominated by a single topic; only a few chapters show two or more topics.
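
This can be quantified by counting how many chapters have a dominant topic with gamma above 0.9 (a minimal sketch; the 0.9 threshold is an arbitrary choice):

# Number of chapters whose strongest topic exceeds 90% of the chapter's topic mass
fen_documents %>%
  group_by(document) %>%
  summarise(max_gamma = max(gamma)) %>%
  summarise(dominated_by_one_topic = sum(max_gamma > 0.9), total_chapters = n())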