System Setup

Install the required packages

# Install any required packages that are not already present
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load packages

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(purrr)
require(tm)
require(data.table)
require(udpipe)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

Read the Data

Data description

  • Using the NSYSU College of Management text analysis platform, we searched PTT for the keywords [如懿傳、如懿] from 2016-06-01 to 2020-03-31, roughly 46 months of data: 1,396 posts and 68,553 comments in total.

Load the PTT data

# Post data
Ruyi_china_drama_articleMetaData <- read_csv("D:/OC Learn/NSYSU/Social Media Analysis/Final Paper/data/Ruyi-china drama_articleMetaData.csv")

Ruyi_articleMetaData <- read_csv("D:/OC Learn/NSYSU/Social Media Analysis/Final Paper/data/Ruyi_articleMetaData.csv")

posts <- rbind(Ruyi_china_drama_articleMetaData,Ruyi_articleMetaData)

posts
# Comment data
Ruyi_china_drama_articleReviews <- read_csv("D:/OC Learn/NSYSU/Social Media Analysis/Final Paper/data/Ruyi-china drama_articleReviews.csv")

Ruyi_articleReviews <- read_csv("D:/OC Learn/NSYSU/Social Media Analysis/Final Paper/data/Ruyi_articleReviews.csv")

reviews <- rbind(Ruyi_china_drama_articleReviews,Ruyi_articleReviews)

reviews
# Select the required columns
reviews <- reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)
reviews

Data preview

posts %>% 
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "blue", size = 1) +
    theme_classic() +
    geom_vline(aes(xintercept = as.numeric(artDate[which(artDate == as.Date('2018-09-14'))[1]])), colour = "red")

# The peak is on 2018/9/14, with more than 15 posts

Number of posters

length(unique(posts$artPoster)) # 331 unique posters
## [1] 331

Number of commenters

length(unique(reviews$cmtPoster)) # 6,289 unique commenters
## [1] 6289

Total number of participants

allPoster <- c(posts$artPoster, reviews$cmtPoster)
length(unique(allPoster)) # 6,413 participants in total; fewer than 331 + 6,289, so some users both posted and commented
## [1] 6413
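
As a cross-check, the size of that overlap can be computed directly; a minimal sketch (the arithmetic follows from the counts above):

length(intersect(posts$artPoster, reviews$cmtPoster)) # 331 + 6,289 - 6,413 = 207 users appear in both roles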

Compile all participants

# Compile every user who appears in the data:
# if a user has posted an article, label them "poster";
# otherwise label them "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
userList
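
A quick tally of the two roles (the counts follow from the figures above):

table(userList$type) # expect 331 posters and 6,413 - 331 = 6,082 replyers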

Building the Social Network

Join posts with comments

# Inner-join the posts and comments on artUrl
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
posts_Reviews

Select columns

# Keep three columns: cmtPoster (commenter), artPoster (poster), artUrl (article URL)
link <- posts_Reviews %>%
      select(cmtPoster, artPoster, artUrl)
link

Build the network

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH c51e8a1 DN-- 6413 94227 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from c51e8a1 (vertex names):
##  [1] a431170     ->tiger5 misskate    ->tiger5 yellow0304  ->tiger5
##  [4] fruitmonster->tiger5 fruitmonster->tiger5 misskate    ->tiger5
##  [7] misskate    ->tiger5 misskate    ->tiger5 misskate    ->tiger5
## [10] misskate    ->tiger5 misskate    ->tiger5 misskate    ->tiger5
## [13] misskate    ->tiger5 Hatemilktea ->tiger5 vini770803  ->tiger5
## [16] myra3335    ->tiger5 myra3335    ->tiger5 myra3335    ->tiger5
## [19] myra3335    ->tiger5 myra3335    ->tiger5 myra3335    ->tiger5
## [22] energy4924  ->tiger5 Melody1340  ->tiger5 Melody1340  ->tiger5
## + ... omitted several edges

Network plot

# Plot the network; it is completely unreadable
plot(reviewNetwork)

> The plot is a dense tangle with no discernible structure. Let's try plotting less data.

Adjust parameters

# Shrink the vertices and edges and hide the user labels; with this many posts it is still unreadable
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2, vertex.label=NA)

> Still nothing interpretable; let's narrow down the set of posts.

Filtering the data

Select the posts from 2018-09-14 and their comments

link <- posts_Reviews %>%
      filter(artDate == as.Date('2018-09-14')) %>%
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link

Filter the vertices (v)

# Keep only the users that appear in link:
# if userList (passed to the v argument of graph_from_data_frame) contains
# users that never appear in link, igraph still draws them as isolated vertices
# and the plot becomes meaningless; run the code below to see the effect
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
filtered_user

Without filtering users

## The plot looks like the head of a sunflower, so it is still uninformative
reviewNetwork <- graph_from_data_frame(d=link, v=userList, directed=T)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

After filtering users

set.seed(567)
# Draw an undirected graph first; some structure is now visible
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=5, edge.arrow.size=0.3,vertex.label=NA)

Enhancing the plot (1)

set.seed(567)
# Color vertices by role: gold for users who have posted, light blue for comment-only users; some separation appears
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=5, edge.arrow.size=0.3,vertex.label=NA)

Enhancing the plot (2)

set.seed(567)
# Label only the users with more than 5 connections; several users qualify
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=5, edge.arrow.size=0.2,
     vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA),  vertex.label.font=2)

Enhancing the plot (3)

set.seed(567)
# Raise the threshold: label only users with more than 50 connections; 4 users qualify
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=5, edge.arrow.size=0.2,
     vertex.label=ifelse(degree(reviewNetwork) > 50, V(reviewNetwork)$label, NA),  vertex.label.font=2)
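
To see exactly who those high-degree users are without squinting at the plot, a small sketch listing the top vertices by degree:

# Top 10 users by number of connections in the filtered network
head(sort(degree(reviewNetwork), decreasing = TRUE), 10)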

Topic Classification

Tokenization

# Initialize a tokenizer with default settings
jieba_tokenizer = worker()
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # Drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
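
A quick sanity check of the tokenizer on a made-up sentence (the input string is illustrative only):

news_tokenizer("如懿傳的劇情真的很好看") # single-character tokens are already dropped inside the function
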
tokens <- posts %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens %>% head(20)

Convert the data to a Document-Term Matrix (DTM)

news_dtm <- tokens %>% cast_dtm(artUrl, word, count)
news_dtm
## <<DocumentTermMatrix (documents: 692, terms: 27394)>>
## Non-/sparse entries: 107315/18849333
## Sparsity           : 99%
## Maximal term length: 11
## Weighting          : term frequency (tf)
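
Since the tm package is loaded, a corner of the sparse matrix can be peeked at directly; a minimal sketch:

inspect(news_dtm[1:5, 1:5]) # view a 5x5 slice of the DTM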

Build the LDA model

lda <- LDA(news_dtm, k = 2, control = list(seed = 123))

\(\phi\) Matrix

Inspect the \(\phi\) matrix (topic * term)

topics <- tidy(lda, matrix = "beta") # Note: pass matrix = "beta" to tidy() to extract the phi matrix.
topics

From topics we obtain the probability that each topic generates each term.
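
For example, a single term's per-topic probability can be looked up directly ("皇后" is just an illustrative term):

topics %>% filter(term == "皇后") # beta of this term under each topic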

Find each topic's representative terms

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

> In the plot above the two topics look nearly identical; let's try more topics.

More topics

  • Try 2, 3, 7, 9, and 36 topics, save the results, then analyze them further.
ldas = c()
topics = c(2, 3, 7, 9, 36)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(news_dtm, k = topic, control = list(seed = 123))
  ldas = c(ldas, lda)
  print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
  save(ldas, file = "ldas_result")
}

This takes hours to run, so the fitted models are saved in the file ldas_result.

Load the saved LDA results

load("ldas_result")

The models live in the ldas list; ldas[[x]] retrieves the model with the desired number of topics.
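
A small indexing sketch to avoid off-by-one mistakes (k_values is a hypothetical helper mirroring the topics vector above):

k_values <- c(2, 3, 7, 9, 36)
lda_9 <- ldas[[which(k_values == 9)]] # the 9-topic model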

透過perplexity找到最佳主題數

topics = c(2,3,7,9,36)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

Smaller perplexity is better, but if it is too small the topics become too fine-grained. In practice, pick a reasonable number of topics with comparatively low perplexity.
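
A single model's perplexity can also be checked on its own; a minimal sketch, assuming the models were fit with the default VEM method as above:

topicmodels::perplexity(ldas[[1]]) # perplexity of the 2-topic model on its training data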

LDAvis

Function that creates the JSON needed by LDAvis

  • This function converts a model built with the LDA function above into the input format of the LDAvis package.
topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)

    ### The helper below works around NAs that appear when the number of topics is large
    ### Reference: https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function(phi) {
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(doc_term)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            doc.length = as.vector(table(doc_term$i)),
                            term.frequency = term_freq, mds.method = ls_LDA)

    return(json_lda)
}

Generate the LDAvis output

topic_9 = ldas[[4]] # the 9-topic model
json_res <- topicmodels_json_ldavis(topic_9, news_dtm)

serVis(json_res,open.browser = T)

Follow-up LDA analysis

new_lda = ldas[[4]] ## select the 9-topic result

topics <- tidy(new_lda, matrix = "beta") # Note: pass matrix = "beta" to tidy() to extract the phi matrix.
topics

Find each topic's representative terms

  • List the 10 terms with the highest generation probability in each topic.
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

> Every topic is dominated by the original search keywords, so the groups cannot be told apart.

  • Remove frequent terms that are shared across topics.
remove_word = c("如懿傳","如懿","皇帝","皇上","皇后","太后","臣妾","自己","一個","沒有","覺得","因為","所以","我們","這個","還是","這樣","就是","不是","可以","其實","真的","不要","知道","看到","什麼","大家")
top_terms <- topics %>%
  filter(!term  %in% remove_word)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Topic naming

# Label the 9 topics (plot speculation 1, casting 1, plot discussion 1, plot speculation 2, casting 2, novel vs. TV adaptation, actors, plot discussion 2, comparison with Story of Yanxi Palace)
topic_name = c('劇情猜測1','角色選角1','劇情討論1','劇情猜測2','角色選角2','小說與電視劇比較','演員','劇情討論2','與延禧攻略比較')

Document-topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(new_lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics
## [1] 692   9

Each post has a topic distribution, hence 692 posts × 9 topics.
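
A small sketch labeling each post with its dominant topic, using doc_pro and the topic_name vector defined above:

main_topic <- topic_name[apply(doc_pro, 1, which.max)] # dominant topic per document
table(main_topic) # number of posts whose strongest topic is each label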

cbind the document-topic distributions

# get document topic proportions 
document_topics <- doc_pro[posts$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
new_topic = cbind(posts,document_topics_df)
new_topic %>% head(10)

Now we can see each post's topic distribution!

Inspect the documents of a specific topic

  • Sort the documents by a given topic's proportion to see what the posts that weight this topic heavily are discussing.
new_topic %>%
  arrange(desc(`與延禧攻略比較`)) %>% head(100)

The posts high on this topic discuss the two dramas' plots, costume design, character contrasts, comparisons with other mainland Chinese dramas, and comparisons between the actors.

LDA with text2vec

We now repeat the analysis with the text2vec package, which fits the model considerably faster.

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
## The following object is masked from 'package:igraph':
## 
##     normalize
library(udpipe)
tokens <- posts %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))

Build the DTM

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 692 752

LDA model

set.seed(123)

topic_n = 9

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [23:06:23.072] early stopping at 220 iteration 
## INFO  [23:06:23.377] early stopping at 60 iteration

This runs many times faster than the topicmodels package.
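
Note that fit_transform already returns the document-topic matrix, analogous to the theta matrix extracted from topicmodels above:

dim(doc_topic_distr) # expect 692 x 9
head(round(doc_topic_distr, 3)) # topic proportions of the first documents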

The result can likewise be viewed with LDAvis:

lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 terms per topic
##       [,1]   [,2]     [,3]   [,4]   [,5]     [,6]     [,7]     [,8]     [,9]  
##  [1,] "自己" "覺得"   "延禧" "劇本" "真的"   "如懿"   "建華"   "小說"   "皇帝"
##  [2,] "最後" "皇后"   "攻略" "這個" "茶碗"   "金玉"   "周迅"   "預告"   "如懿"
##  [3,] "不能" "貴人"   "劇情" "故事" "討論"   "皇上"   "飾演"   "電視劇" "阿哥"
##  [4,] "冷宮" "貴妃"   "播出" "觀眾" "覺得"   "臣妾"   "網友"   "如懿"   "太后"
##  [5,] "這些" "嬪妃"   "人物" "編劇" "演技"   "乾隆"   "中國"   "一樣"   "皇后"
##  [6,] "一句" "宮女"   "服裝" "我們" "大家"   "凌雲"   "拍攝"   "開始"   "貴妃"
##  [7,] "陷害" "太監"   "宮廷" "愛情" "還是"   "不是"   "汪俊"   "青櫻"   "凌雲"
##  [8,] "因為" "一直"   "瓔珞" "一個" "如懿傳" "衛嬿婉" "林心如" "期待"   "海蘭"
##  [9,] "一個" "那拉氏" "比較" "一些" "其實"   "什麼"   "報導"   "婚姻"   "公主"
## [10,] "已經" "福晉"   "好看" "不同" "時候"   "只是"   "大陸"   "悲劇"   "永琪"
lda_model$plot()
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)