組員:

  • 林子紘 B074020021
  • 彭璿祐 B064020029
  • 徐明暇 D084020002
  • 劉晉瑋 M094020006
  • 洪玟君 M094020030
  • 林永盛 M094020042

一、套件取得及資料載入

套件

library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
library(igraph)

資料描述

透過中山管院文字分析平台,載入聯合新聞網、蘋果新聞網、東森新聞網的新聞,搜尋關鍵字為「藻礁、三接、陳昭倫、潘忠政」,時間從2021/01/01到2021/05/15。
https://rpubs.com/mhhsu/topic_model_reef

metadata <- fread("news_reef_articleMetaData.csv", encoding = "UTF-8")

可以看到藻礁公投討論有幾波討論高點
1.在228連假時連署呼聲的新聞報導數量增加,2月中前幾乎沒有人知道,到2月中時國民黨羅智強呼籲連署藻礁公投,網路聲量往上衝,新聞報導增加
2.3/13藻礁公投連署書收69萬餘件,準備送進中選會進行公投成案
3.3/31農委會主委陳吉仲代表政府拜訪發起來潘忠政
4.4/22世界地球日蔡英文總統接見環團組織,含潘忠政對藻礁議題無交集
5.5/3政院提三接外推方案

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

二、Document Term Matrix (DTM)

資料前處理

初始化一個斷詞引擎

jieba_tokenizer = worker(user="reef_dict.txt", stop_word = "reef_stop_words.txt")

news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

計算每篇文章各token出現次數

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))  %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens 
##                                             artUrl     word count
##     1: https://news.ebc.net.tw/news/article/250089     一年     1
##     2: https://news.ebc.net.tw/news/article/250089     人士     1
##     3: https://news.ebc.net.tw/news/article/250089     人事     2
##     4: https://news.ebc.net.tw/news/article/250089 人事安排     2
##     5: https://news.ebc.net.tw/news/article/250089     人選     1
##    ---                                                           
## 68909:     https://udn.com/news/story/7314/5434564     環團     1
## 68910:     https://udn.com/news/story/7314/5434564     離岸     1
## 68911:     https://udn.com/news/story/7314/5434564     藻礁     1
## 68912:     https://udn.com/news/story/7314/5434564     議題     1
## 68913:     https://udn.com/news/story/7314/5434564     變更     4

將資料轉換為Document Term Matrix (DTM)

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 573, terms: 11969)>>
## Non-/sparse entries: 68913/6789324
## Sparsity           : 99%
## Maximal term length: 8
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 13/87
## Sparsity           : 87%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          一年 人士 人事 人事安排 人選 三方
##   https://news.ebc.net.tw/news/article/250089    1    1    2        2    1    1
##   https://news.ebc.net.tw/news/article/250897    0    2    0        0    0    0
##   https://news.ebc.net.tw/news/article/250926    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251058    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251166    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251281    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251438    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251477    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251725    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251792    0    0    0        0    0    0
##                                              Terms
## Docs                                          三接 上是 上海 大陸
##   https://news.ebc.net.tw/news/article/250089    3    1    1    3
##   https://news.ebc.net.tw/news/article/250897    7    0    0    0
##   https://news.ebc.net.tw/news/article/250926    0    0    0    0
##   https://news.ebc.net.tw/news/article/251058    0    0    0    0
##   https://news.ebc.net.tw/news/article/251166    0    0    0    0
##   https://news.ebc.net.tw/news/article/251281    0    0    0    0
##   https://news.ebc.net.tw/news/article/251438    0    0    0    0
##   https://news.ebc.net.tw/news/article/251477    0    0    0    0
##   https://news.ebc.net.tw/news/article/251725    1    0    0    0
##   https://news.ebc.net.tw/news/article/251792    0    0    0    0

三、主題模型

建立LDA模型

# lda <- LDA(dtm, k = 3, control = list(seed = 2021))
# lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs")
# alpha=50/k delta在TMWS平台測試為0.2有較好的效果(各主題中心的距離較遠),表示各主題的意義較有區隔 

lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
#調整alpha即delta
lda
## A LDA_Gibbs topic model with 5 topics.

利用LDA模型建立phi矩陣

topics_words <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 59,845 x 3
##    topic term         phi
##    <int> <chr>      <dbl>
##  1     1 一年  0.000739  
##  2     2 一年  0.000521  
##  3     3 一年  0.000143  
##  4     4 一年  0.0000105 
##  5     5 一年  0.00000790
##  6     1 人士  0.000156  
##  7     2 人士  0.0000474 
##  8     3 人士  0.00350   
##  9     4 人士  0.0000631 
## 10     5 人士  0.00115   
## # ... with 59,835 more rows

尋找Topic的代表字

terms依照各主題的phi值由大到小排序,列出前10大

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

四、尋找最佳主題數

建立更多主題的主題模型

嘗試3,4,5,7,9個主題數,將結果存起來,再做進一步分析。 此部分需要跑一段時間,已經將跑完的檔案存成ldas_result_reef.rdata,可以直接載入

# lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
# ldas = c()
# topics = c(3,4,5,7,9)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   # lda <- LDA(dtm, k = topic, control = list(seed = 2021,alpha = (50/topic),delta=0.2),method = "Gibbs")
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas,file = "ldas_result_reef.rdata") # 將模型輸出成檔案
# }

載入每個主題的LDA結果

load("ldas_result_reef.rdata")

透過perplexity找到最佳主題數

topics = c(3,4,5,7,9)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.

  • 主題數為5 Perplexity 低且下降趨緩

create LDAvis所需的json function
此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ###以下function 用來解決,主題數多會出現NA的問題
    ### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}

產生LDAvis結果

for(lda in ldas){

   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}

   serVis(json_res, out.dir =lda_dir, open.browser = F)

   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
}


the_lda = ldas[[3]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
#這一行在windows並未開啟LdaVis網頁?? 
serVis(json_res,open.browser = T)

產生LDAvis檔案,存至local端

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

五、LDA分析

選定5個主題數的主題模型

# the_lda = ldas[[3]] ## 選定topic 為 5 的結果
the_lda_5 <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs") #主題數分為5個 
topics_words <- tidy(the_lda_5, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(50)
## # A tibble: 50 x 3
##    topic term        phi
##    <int> <chr>     <dbl>
##  1     4 連署     0.0667
##  2     1 藻礁     0.0635
##  3     3 民進黨   0.0331
##  4     5 環團     0.0286
##  5     4 藻礁公投 0.0276
##  6     2 方案     0.0272
##  7     3 國民黨   0.0237
##  8     5 溝通     0.0234
##  9     5 潘忠政   0.0218
## 10     3 公投     0.0211
## # ... with 40 more rows

terms依照各主題的phi值由大到小排序

topics_words %>%
  group_by(topic) %>%
  top_n(15, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

去除共通詞彙,

removed_word = c("藻礁","表示","可以")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題命名

生態保護-討論主題為保育藻礁及海岸生態

# topics_name = c("生態保護","政府方案","政府與環團溝通","反綠營","能源政策") 政府與觀注方攻防
topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda_5)
doc_pro <- tmResult$topics #每篇文章的機率分佈 
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics) #將document_topics轉成dataframe 
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

現在我們看每一篇的文章分佈了!

查看特定主題的文章

  • 透過找到特定文章的分佈進行排序之後,可以看到此主題的比重高的文章在討論什麼。
 news_topic %>% 
  arrange(desc(`珍愛藻礁公投連署`)) %>% select(artTitle,artDate,`珍愛藻礁公投連署`) %>% head(30) 

“珍愛藻礁公投連署” 主題多為3月中旬前藻礁公投連署訴求及連署活動

news_topic %>%
  arrange(desc(`政院三接方案`)) %>%  select(artTitle,artDate,`政院三接方案`) %>% head(30) 

“政院三接外推方案” 主題多為在確定進行公投後,5/3 政院所提的三接外推方案,以影響民眾投下不同意的動向

news_topic %>%
  arrange(desc(`政府/環團溝通`)) %>% select(artTitle,artDate,`政府/環團溝通`) %>% head(30) 

政府在3月下旬確定連署人數達70萬,開始找陳吉仲與環團溝通,與1月前的態度不同

news_topic %>%
  arrange(desc(`藍綠政治攻防`)) %>% select(artTitle,artDate,`藍綠政治攻防`) %>% head(30) 

“藍綠政黨攻防” 主題多為讓環保議題轉為政治議題,藍綠及其它政黨如時代力量的政治人物發表評論

了解主題在時間的變化

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>% #每月統計
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% #每月發表在各主題的篇數
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

3月份為連署階段, 國民黨江啟臣表態支持後,就有更多的政治人員表態。
5月最主要的議題王美花召開記者會,提出政院三接外推方案

去除筆數少月份

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + #bar圖 
    scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

以比例了解主題時間變化

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +  #找出佔比 
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12,18)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

“珍愛藻礁公投連署”隨著連署跨過安全門檻,該主題佔比逐月變少
伴隨而來是確定要公投成案,連署書背後所代表的民意,政府方尋求與環團溝通, 該主題佔比逐月增加

六、補充 - 不同訓練LDA模型套件

參考 http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:igraph':
## 
##     normalize
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))

建立DTM matrix

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dim(dtm)
## [1]   573 11969
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 20)#少於20的matrix
dim(dtm_clean)
## [1] 573 884

LDA 模型

set.seed(20190)

topic_n = 5
#lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.004) #效果不錯
#以alpha 0.15 Beta=0.004 可得到獨立的主題
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.2, topic_word_prior = 0.004) #搭配主題為5 
# lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.2, topic_word_prior = 0.004) #搭配主題為4 
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)
## INFO  [20:50:25.477] early stopping at 130 iteration 
## INFO  [20:50:25.681] early stopping at 40 iteration

這個比topicmodels的package跑快超多倍

一樣可以用LDAvis的套件來看

lda_model$get_top_words(n = 30, lambda = 0.5) ## 查看 前30主題字
##       [,1]       [,2]         [,3]       [,4]           [,5]      
##  [1,] "藻礁"     "潘忠政"     "連署"     "台灣"         "方案"    
##  [2,] "生態"     "陳吉仲"     "萬份"     "供電"         "外推"    
##  [3,] "大潭藻礁" "溝通"       "民眾"     "王美花"       "民進黨"  
##  [4,] "中油"     "環團"       "藻礁公投" "環保"         "提出"    
##  [5,] "保護"     "領銜人"     "國民黨"   "問題"         "能源轉型"
##  [6,] "保育"     "聯盟"       "公投"     "燃煤"         "行政院"  
##  [7,] "三接"     "總統"       "門檻"     "蘇貞昌"       "替代"    
##  [8,] "海岸"     "藻礁公投"   "中選會"   "穩定"         "說明"    
##  [9,] "桃園"     "蔡總統"     "珍愛"     "天然氣"       "執政"    
## [10,] "鄭文燦"   "見面"       "總部"     "電力"         "蔡英文"  
## [11,] "公頃"     "農委會主委" "支持"     "爭議"         "立法院"  
## [12,] "開發"     "公投"       "政治"     "能源"         "政府"    
## [13,] "破壞"     "雙方"       "市府"     "機組"         "是否"    
## [14,] "工程"     "代表"       "萊豬"     "發電"         "政策"    
## [15,] "環境"     "政府"       "議題"     "這裡"         "國民黨"  
## [16,] "孫大千"   "聽證會"     "收到"     "公投流程"     "解決"    
## [17,] "市長"     "農委會"     "成案"     "公投進度"     "討論"    
## [18,] "生態系"   "會議"       "呼籲"     "圖解"         "議題"    
## [19,] "保護區"   "媒體"       "發起"     "增加"         "時程"    
## [20,] "環保署"   "決定"       "桃園市"   "燃氣"         "雙贏"    
## [21,] "自然"     "林飛帆"     "公投小組" "選擇"         "發言人"  
## [22,] "學者"     "會面"       "突破"     "馬英九"       "進行"    
## [23,] "環評"     "舉辦"       "志工"     "大潭電廠"     "意見"    
## [24,] "面積"     "對話"       "參與"     "看懂"         "黨團"    
## [25,] "承諾"     "所有"       "小組"     "台北"         "積極"    
## [26,] "傷害"     "決策"       "政黨"     "攻防"         "朋友"    
## [27,] "經濟部"   "主文"       "力量"     "興建"         "三接"    
## [28,] "範圍"     "摸頭"       "份數"     "天然氣接收站" "公尺"    
## [29,] "停工"     "何宗勳"     "珍愛藻礁" "減少"         "電廠"    
## [30,] "觀塘"     "公民"       "主席"     "藻礁公投"     "希望"
lda_model$plot() 
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

這個LDA模型套件(text2vec),所找出的五個主題,LDAVis呈現主題有
1.府院方案-三接外推,供電能源轉型評估
2.政府(陳吉仲,蔡總統)與環團代表溝通
3.大潭藻礁位於桃園,桃園市長鄭文燦對藻礁議題的發言
4.綠政黨對藻礁議題連結就是支時重啟核四,但又是前總統馬英九封存核四的,藍營則反駁抹黑造謠等。
5.珍愛藻礁公投連署活動

http://127.0.0.1:4321/#topic=1&lambda=0.6&term=

七、網路分析

建立網路關係

# topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")
#基本檔 
news_topic_basic <- tibble(
  topic=c(1:5),
  topics_name=topics_name
)

news1_topic_tmp<- news_topic %>% select (artUrl,topics_name) %>% 
  mutate(document=1:n())  %>% 
  select (document,topics_name,document) 
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(topics_name)` instead of `topics_name` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
news1_topic_tmp<-  gather(news1_topic_tmp,topics_name,"gamma",-document)

news_gamma <-
news1_topic_tmp %>% 
  left_join(news_topic_basic)  %>% #每個document對應為1~5個topic 
  arrange(document)
## Joining, by = "topics_name"
#計算每個主題的新聞數量
news_gamma <- news_gamma %>% 
  group_by(document) %>% 
  filter(gamma == (max(gamma))) #gamma值最大歸屬於該主題 

table(news_gamma$topic)
## 
##   1   2   3   4   5 
##  94 154 111  94 126
link <- news_gamma %>% 
  # 只篩選 gamma 值大於 0.6
  filter(gamma > 0.6) %>%
  # 把 gamma 值當成 link 的權重
  rename(weight = gamma)

# 把 topic 欄位的 1 取代成 topic 1,依此類推,避免與 document 的 1 混淆 
link$topic <- link$topic %>%
  gsub(1, "topic 1", .) %>%
  gsub(2, "topic 2", .) %>%
  gsub(3, "topic 3", .) %>%
  gsub(4, "topic 4", .) %>%
  gsub(5, "topic 5", .)

# 建立無向圖
TopicNetwork <- graph_from_data_frame(d = link, directed = F)
TopicNetwork
## IGRAPH c7c62bf UNW- 77 72 -- 
## + attr: name (v/c), weight (e/n), topic (e/c)
## + edges from c7c62bf (vertex names):
##  [1] 20 --珍愛藻礁公投連署            26 --珍愛藻礁公投連署           
##  [3] 33 --珍愛藻礁公投連署            35 --珍愛藻礁公投連署           
##  [5] 37 --珍愛藻礁公投連署            38 --珍愛藻礁公投連署           
##  [7] 39 --珍愛藻礁公投連署            40 --珍愛藻礁公投連署           
##  [9] 43 --珍愛藻礁公投連署            44 --珍愛藻礁公投連署           
## [11] 45 --珍愛藻礁公投連署            48 --珍愛藻礁公投連署           
## [13] 52 --珍愛藻礁公投連署            61 --珍愛藻礁公投連署           
## [15] 63 --珍愛藻礁公投連署            64 --珍愛藻礁公投連署           
## + ... omitted several edges
set.seed(2021)
plot(TopicNetwork)

# 調整點線大小且不顯示節點名稱
#
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5, vertex.label = NA)

# 顯示有超過 5 個關聯的節點名稱
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5,
     vertex.label = ifelse(degree(TopicNetwork) > 5, V(TopicNetwork)$name, NA), vertex.label.font = 15)

> Gamma值>0.6 幾個topic分佈均頗為獨立

vertex_attr(TopicNetwork)
## $name
##  [1] "20"                          "26"                         
##  [3] "33"                          "35"                         
##  [5] "37"                          "38"                         
##  [7] "39"                          "40"                         
##  [9] "43"                          "44"                         
## [11] "45"                          "48"                         
## [13] "52"                          "61"                         
## [15] "63"                          "64"                         
## [17] "65"                          "74"                         
## [19] "82"                          "86"                         
## [21] "87"                          "89"                         
## [23] "90"                          "94"                         
## [25] "95"                          "104"                        
## [27] "125"                         "133"                        
## [29] "150"                         "165"                        
## [31] "175"                         "183"                        
## [33] "209"                         "211"                        
## [35] "221"                         "225"                        
## [37] "228"                         "256"                        
## [39] "257"                         "258"                        
## [41] "261"                         "262"                        
## [43] "278"                         "280"                        
## [45] "286"                         "308"                        
## [47] "343"                         "344"                        
## [49] "348"                         "352"                        
## [51] "369"                         "376"                        
## [53] "377"                         "378"                        
## [55] "382"                         "385"                        
## [57] "394"                         "395"                        
## [59] "440"                         "441"                        
## [61] "456"                         "457"                        
## [63] "464"                         "507"                        
## [65] "513"                         "521"                        
## [67] "527"                         "530"                        
## [69] "557"                         "566"                        
## [71] "568"                         "573"                        
## [73] "珍愛藻礁公投連署"            "藍綠政治攻防"               
## [75] "桃園政府對藻礁生態/開發意向" "政院三接方案"               
## [77] "政府/環團溝通"

設定連結屬性

# 設定連結的 type 為主題分類
E(TopicNetwork)$type <- link$topic
# 設定 weight
E(TopicNetwork)$weight <- link$weight

# edge_attr(TopicNetwork) 

Density

edge_density(TopicNetwork)
## [1] 0.02460697
reciprocity(TopicNetwork)
## [1] 1

無向圖, reciprocity=1

transitivity(TopicNetwork, type="global")
## [1] 0
diameter(TopicNetwork, directed = F)
## [1] 1.565776
mean_distance(TopicNetwork, directed=F)
## [1] 1.88697
deg <- degree(TopicNetwork, mode = "all")
plot(TopicNetwork, 
     # 依照 degree 大小設定節點大小
     vertex.size = deg * 3,
     # 只顯示 degree 大於 5 的節點名稱
     vertex.label = ifelse(deg > 5, V(TopicNetwork)$name, NA))

> 珍愛藻礁公投連署最多degree , 概念上表示辨識為該topic的新聞數量最多

# Histogram of node degree
hist(deg, breaks = 1:vcount(TopicNetwork)-1, main = "Histogram of node degree")

# Degree distribution
deg.dist <- degree_distribution(TopicNetwork, cumulative=T, mode="all")
plot( x=0:max(deg), y=1-deg.dist, pch=19, cex=1.2, col="orange", xlab="Degree", ylab="Cumulative Frequency")

y軸為累積的出現頻率,Degree 1~2點的差距較大,表示degree 2的節點數最多

# Degree Centrality 

degree(TopicNetwork, mode = "all") 
##                          20                          26 
##                           1                           1 
##                          33                          35 
##                           1                           1 
##                          37                          38 
##                           1                           1 
##                          39                          40 
##                           1                           1 
##                          43                          44 
##                           1                           1 
##                          45                          48 
##                           1                           1 
##                          52                          61 
##                           1                           1 
##                          63                          64 
##                           1                           1 
##                          65                          74 
##                           1                           1 
##                          82                          86 
##                           1                           1 
##                          87                          89 
##                           1                           1 
##                          90                          94 
##                           1                           1 
##                          95                         104 
##                           1                           1 
##                         125                         133 
##                           1                           1 
##                         150                         165 
##                           1                           1 
##                         175                         183 
##                           1                           1 
##                         209                         211 
##                           1                           1 
##                         221                         225 
##                           1                           1 
##                         228                         256 
##                           1                           1 
##                         257                         258 
##                           1                           1 
##                         261                         262 
##                           1                           1 
##                         278                         280 
##                           1                           1 
##                         286                         308 
##                           1                           1 
##                         343                         344 
##                           1                           1 
##                         348                         352 
##                           1                           1 
##                         369                         376 
##                           1                           1 
##                         377                         378 
##                           1                           1 
##                         382                         385 
##                           1                           1 
##                         394                         395 
##                           1                           1 
##                         440                         441 
##                           1                           1 
##                         456                         457 
##                           1                           1 
##                         464                         507 
##                           1                           1 
##                         513                         521 
##                           1                           1 
##                         527                         530 
##                           1                           1 
##                         557                         566 
##                           1                           1 
##                         568                         573 
##                           1                           1 
##            珍愛藻礁公投連署                藍綠政治攻防 
##                          25                           8 
## 桃園政府對藻礁生態/開發意向                政院三接方案 
##                          14                          14 
##               政府/環團溝通 
##                          11
centr_degree(TopicNetwork, mode = "all", normalized = T)
## $res
##  [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [26]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [51]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 25  8 14
## [76] 14 11
## 
## $centralization
## [1] 0.3043404
## 
## $theoretical_max
## [1] 5852
  • 各主題 (topic1, topic2, topic3, topic4,topic5 ) 的節點的 Degree Centrality 較高
  • res : 節點中心度
  • centralization
  • theoretical_max : 最大中心化分數
# eigen_centrality(TopicNetwork, directed = F, weights = NA)
# centr_eigen(TopicNetwork, directed = F, normalized = T)
betweenness(TopicNetwork, directed = F) %>% head()
## 20 26 33 35 37 38 
##  0  0  0  0  0  0
edge_betweenness(TopicNetwork, directed = F)
##  [1] 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25  8 25  8 25 14 25 14 25
## [26] 25 14 11 11 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 25  8  8 14 14 14
## [51] 11 11 11 11 11  8 11 14 11 11 25  8 14 14  8  8 14 14 14 25 14 11
centr_betw(TopicNetwork, directed = F, normalized = T)
## $res
##  [1]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [20]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [39]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [58]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 300  28  91  91
## [77]  55
## 
## $centralization
## [1] 0.1040397
## 
## $theoretical_max
## [1] 216600
# Find cliques 
net.sym <- as.undirected(TopicNetwork, mode = "collapse", edge.attr.comb = list(weight = "sum", "ignore"))

#cliques(net.sym) # list of cliques       
sapply(cliques(net.sym), length) # clique sizes
##   [1] 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [38] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
##  [75] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [112] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [149] 2
#largest_cliques(net.sym) # cliques with max number of nodes 
vcol <- rep("grey80", vcount(net.sym))
vcol[unlist(largest_cliques(net.sym))] <- "gold"
plot(as.undirected(net.sym), vertex.label=V(net.sym)$name, vertex.color=vcol) 

# Community detection 

ceb <- cluster_edge_betweenness(TopicNetwork)  
#dendPlot(ceb, mode="hclust")
plot(ceb, TopicNetwork) 

phi_m <- topics_words %>% arrange(desc(phi)) %>% top_n(70)
## Selecting by phi
dtm <-phi_m %>% cast_dtm(topic, term, phi)

dtmm<-as.matrix(dtm)
dim(dtmm)
## [1]  5 62
network=graph_from_incidence_matrix(dtmm)

# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex=0.7, vertex.size=10, vertex.label.family = "Heiti TC Light")
## Warning in text.default(x, y, labels = labels, col = label.color, family =
## label.family, : font family not found in Windows font database