組員:

林子紘 B074020021
彭璿祐 B064020029
徐明暇 D084020002
劉晉瑋 M094020006
洪玟君 M094020030
林永盛 M094020042

一、套件取得及資料載入

套件

library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
library(igraph)

資料描述

透過中山管院文字分析平台，載入聯合新聞網、蘋果新聞網、東森新聞網的新聞，搜尋關鍵字為「藻礁、三接、陳昭倫、潘忠政」，時間從2021/01/01到2021/05/15。
https://rpubs.com/mhhsu/topic_model_reef

metadata <- fread("news_reef_articleMetaData.csv", encoding = "UTF-8")

可以看到藻礁公投討論有幾波討論高點
1.在228連假時連署呼聲的新聞報導數量增加，2月中前幾乎沒有人知道，到2月中時國民黨羅智強呼籲連署藻礁公投,網路聲量往上衝，新聞報導增加
2.3/13藻礁公投連署書收69萬餘件,準備送進中選會進行公投成案
3.3/31農委會主委陳吉仲代表政府拜訪發起來潘忠政
4.4/22世界地球日蔡英文總統接見環團組織,含潘忠政對藻礁議題無交集
5.5/3政院提三接外推方案

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

二、Document Term Matrix (DTM)

資料前處理

初始化一個斷詞引擎

jieba_tokenizer = worker(user="reef_dict.txt", stop_word = "reef_stop_words.txt")

news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

計算每篇文章各token出現次數

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))  %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens

##                                             artUrl     word count
##     1: https://news.ebc.net.tw/news/article/250089     一年     1
##     2: https://news.ebc.net.tw/news/article/250089     人士     1
##     3: https://news.ebc.net.tw/news/article/250089     人事     2
##     4: https://news.ebc.net.tw/news/article/250089 人事安排     2
##     5: https://news.ebc.net.tw/news/article/250089     人選     1
##    ---                                                           
## 68909:     https://udn.com/news/story/7314/5434564     環團     1
## 68910:     https://udn.com/news/story/7314/5434564     離岸     1
## 68911:     https://udn.com/news/story/7314/5434564     藻礁     1
## 68912:     https://udn.com/news/story/7314/5434564     議題     1
## 68913:     https://udn.com/news/story/7314/5434564     變更     4

將資料轉換為Document Term Matrix (DTM)

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm

## <<DocumentTermMatrix (documents: 573, terms: 11969)>>
## Non-/sparse entries: 68913/6789324
## Sparsity           : 99%
## Maximal term length: 8
## Weighting          : term frequency (tf)

inspect(dtm[1:10,1:10])

## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 13/87
## Sparsity           : 87%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          一年 人士 人事 人事安排 人選 三方
##   https://news.ebc.net.tw/news/article/250089    1    1    2        2    1    1
##   https://news.ebc.net.tw/news/article/250897    0    2    0        0    0    0
##   https://news.ebc.net.tw/news/article/250926    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251058    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251166    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251281    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251438    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251477    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251725    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251792    0    0    0        0    0    0
##                                              Terms
## Docs                                          三接 上是 上海 大陸
##   https://news.ebc.net.tw/news/article/250089    3    1    1    3
##   https://news.ebc.net.tw/news/article/250897    7    0    0    0
##   https://news.ebc.net.tw/news/article/250926    0    0    0    0
##   https://news.ebc.net.tw/news/article/251058    0    0    0    0
##   https://news.ebc.net.tw/news/article/251166    0    0    0    0
##   https://news.ebc.net.tw/news/article/251281    0    0    0    0
##   https://news.ebc.net.tw/news/article/251438    0    0    0    0
##   https://news.ebc.net.tw/news/article/251477    0    0    0    0
##   https://news.ebc.net.tw/news/article/251725    1    0    0    0
##   https://news.ebc.net.tw/news/article/251792    0    0    0    0

三、主題模型

建立LDA模型

# lda <- LDA(dtm, k = 3, control = list(seed = 2021))
# lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs")
# alpha=50/k delta在TMWS平台測試為0.2有較好的效果(各主題中心的距離較遠)，表示各主題的意義較有區隔 

lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
#調整alpha即delta
lda

## A LDA_Gibbs topic model with 5 topics.

利用LDA模型建立phi矩陣

topics_words <- tidy(lda, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words

## # A tibble: 59,845 x 3
##    topic term         phi
##    <int> <chr>      <dbl>
##  1     1 一年  0.000739  
##  2     2 一年  0.000521  
##  3     3 一年  0.000143  
##  4     4 一年  0.0000105 
##  5     5 一年  0.00000790
##  6     1 人士  0.000156  
##  7     2 人士  0.0000474 
##  8     3 人士  0.00350   
##  9     4 人士  0.0000631 
## 10     5 人士  0.00115   
## # ... with 59,835 more rows

尋找Topic的代表字

terms依照各主題的phi值由大到小排序，列出前10大

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

四、尋找最佳主題數

建立更多主題的主題模型

嘗試3,4,5,7,9個主題數，將結果存起來，再做進一步分析。此部分需要跑一段時間，已經將跑完的檔案存成ldas_result_reef.rdata，可以直接載入

# lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
# ldas = c()
# topics = c(3,4,5,7,9)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   # lda <- LDA(dtm, k = topic, control = list(seed = 2021,alpha = (50/topic),delta=0.2),method = "Gibbs")
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas,file = "ldas_result_reef.rdata") # 將模型輸出成檔案
# }

載入每個主題的LDA結果

load("ldas_result_reef.rdata")

透過perplexity找到最佳主題數

topics = c(3,4,5,7,9)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.

主題數為5 Perplexity 低且下降趨緩

create LDAvis所需的json function
此function是將前面使用 “LDA function”所建立的model，轉換為“LDAVis”套件的input格式。

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ###以下function 用來解決，主題數多會出現NA的問題
    ### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}

產生LDAvis結果

for(lda in ldas){

   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}

   serVis(json_res, out.dir =lda_dir, open.browser = F)

   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
}


the_lda = ldas[[3]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
#這一行在windows並未開啟LdaVis網頁?? 
serVis(json_res,open.browser = T)

產生LDAvis檔案，存至local端

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

五、LDA分析

選定5個主題數的主題模型

# the_lda = ldas[[3]] ## 選定topic 為 5 的結果
the_lda_5 <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs") #主題數分為5個

topics_words <- tidy(the_lda_5, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(50)

## # A tibble: 50 x 3
##    topic term        phi
##    <int> <chr>     <dbl>
##  1     4 連署     0.0667
##  2     1 藻礁     0.0635
##  3     3 民進黨   0.0331
##  4     5 環團     0.0286
##  5     4 藻礁公投 0.0276
##  6     2 方案     0.0272
##  7     3 國民黨   0.0237
##  8     5 溝通     0.0234
##  9     5 潘忠政   0.0218
## 10     3 公投     0.0211
## # ... with 40 more rows

terms依照各主題的phi值由大到小排序

topics_words %>%
  group_by(topic) %>%
  top_n(15, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

去除共通詞彙，

removed_word = c("藻礁","表示","可以")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題命名

生態保護-討論主題為保育藻礁及海岸生態

# topics_name = c("生態保護","政府方案","政府與環團溝通","反綠營","能源政策") 政府與觀注方攻防
topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda_5)
doc_pro <- tmResult$topics #每篇文章的機率分佈 
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics) #將document_topics轉成dataframe 
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

現在我們看每一篇的文章分佈了！

查看特定主題的文章

透過找到特定文章的分佈進行排序之後，可以看到此主題的比重高的文章在討論什麼。

 news_topic %>% 
  arrange(desc(`珍愛藻礁公投連署`)) %>% select(artTitle,artDate,`珍愛藻礁公投連署`) %>% head(30)

“珍愛藻礁公投連署” 主題多為3月中旬前藻礁公投連署訴求及連署活動

news_topic %>%
  arrange(desc(`政院三接方案`)) %>%  select(artTitle,artDate,`政院三接方案`) %>% head(30)

“政院三接外推方案” 主題多為在確定進行公投後,5/3 政院所提的三接外推方案，以影響民眾投下不同意的動向

news_topic %>%
  arrange(desc(`政府/環團溝通`)) %>% select(artTitle,artDate,`政府/環團溝通`) %>% head(30)

政府在3月下旬確定連署人數達70萬，開始找陳吉仲與環團溝通,與1月前的態度不同

news_topic %>%
  arrange(desc(`藍綠政治攻防`)) %>% select(artTitle,artDate,`藍綠政治攻防`) %>% head(30)

“藍綠政黨攻防” 主題多為讓環保議題轉為政治議題，藍綠及其它政黨如時代力量的政治人物發表評論

了解主題在時間的變化

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>% #每月統計
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% #每月發表在各主題的篇數
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

3月份為連署階段, 國民黨江啟臣表態支持後,就有更多的政治人員表態。
5月最主要的議題王美花召開記者會,提出政院三接外推方案

去除筆數少月份

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + #bar圖 
    scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

以比例了解主題時間變化

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +  #找出佔比 
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12,18)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

“珍愛藻礁公投連署”隨著連署跨過安全門檻,該主題佔比逐月變少
伴隨而來是確定要公投成案,連署書背後所代表的民意,政府方尋求與環團溝通, 該主題佔比逐月增加

六、補充 - 不同訓練LDA模型套件

參考 http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)

## Warning: package 'text2vec' was built under R version 4.0.5

## 
## Attaching package: 'text2vec'

## The following object is masked from 'package:igraph':
## 
##     normalize

## The following object is masked from 'package:topicmodels':
## 
##     perplexity

library(udpipe)

## Warning: package 'udpipe' was built under R version 4.0.5

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))

建立DTM matrix

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dim(dtm)

## [1]   573 11969

dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 20)#少於20的matrix
dim(dtm_clean)

## [1] 573 884

LDA 模型

set.seed(20190)

topic_n = 5
#lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.004) #效果不錯
#以alpha 0.15 Beta=0.004 可得到獨立的主題
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.2, topic_word_prior = 0.004) #搭配主題為5 
# lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.2, topic_word_prior = 0.004) #搭配主題為4 
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)

## INFO  [20:50:25.477] early stopping at 130 iteration 
## INFO  [20:50:25.681] early stopping at 40 iteration

這個比topicmodels的package跑快超多倍

一樣可以用LDAvis的套件來看

lda_model$get_top_words(n = 30, lambda = 0.5) ## 查看 前30主題字

##       [,1]       [,2]         [,3]       [,4]           [,5]      
##  [1,] "藻礁"     "潘忠政"     "連署"     "台灣"         "方案"    
##  [2,] "生態"     "陳吉仲"     "萬份"     "供電"         "外推"    
##  [3,] "大潭藻礁" "溝通"       "民眾"     "王美花"       "民進黨"  
##  [4,] "中油"     "環團"       "藻礁公投" "環保"         "提出"    
##  [5,] "保護"     "領銜人"     "國民黨"   "問題"         "能源轉型"
##  [6,] "保育"     "聯盟"       "公投"     "燃煤"         "行政院"  
##  [7,] "三接"     "總統"       "門檻"     "蘇貞昌"       "替代"    
##  [8,] "海岸"     "藻礁公投"   "中選會"   "穩定"         "說明"    
##  [9,] "桃園"     "蔡總統"     "珍愛"     "天然氣"       "執政"    
## [10,] "鄭文燦"   "見面"       "總部"     "電力"         "蔡英文"  
## [11,] "公頃"     "農委會主委" "支持"     "爭議"         "立法院"  
## [12,] "開發"     "公投"       "政治"     "能源"         "政府"    
## [13,] "破壞"     "雙方"       "市府"     "機組"         "是否"    
## [14,] "工程"     "代表"       "萊豬"     "發電"         "政策"    
## [15,] "環境"     "政府"       "議題"     "這裡"         "國民黨"  
## [16,] "孫大千"   "聽證會"     "收到"     "公投流程"     "解決"    
## [17,] "市長"     "農委會"     "成案"     "公投進度"     "討論"    
## [18,] "生態系"   "會議"       "呼籲"     "圖解"         "議題"    
## [19,] "保護區"   "媒體"       "發起"     "增加"         "時程"    
## [20,] "環保署"   "決定"       "桃園市"   "燃氣"         "雙贏"    
## [21,] "自然"     "林飛帆"     "公投小組" "選擇"         "發言人"  
## [22,] "學者"     "會面"       "突破"     "馬英九"       "進行"    
## [23,] "環評"     "舉辦"       "志工"     "大潭電廠"     "意見"    
## [24,] "面積"     "對話"       "參與"     "看懂"         "黨團"    
## [25,] "承諾"     "所有"       "小組"     "台北"         "積極"    
## [26,] "傷害"     "決策"       "政黨"     "攻防"         "朋友"    
## [27,] "經濟部"   "主文"       "力量"     "興建"         "三接"    
## [28,] "範圍"     "摸頭"       "份數"     "天然氣接收站" "公尺"    
## [29,] "停工"     "何宗勳"     "珍愛藻礁" "減少"         "電廠"    
## [30,] "觀塘"     "公民"       "主席"     "藻礁公投"     "希望"

lda_model$plot() 
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

這個LDA模型套件(text2vec),所找出的五個主題,LDAVis呈現主題有
1.府院方案-三接外推,供電能源轉型評估
2.政府(陳吉仲,蔡總統)與環團代表溝通
3.大潭藻礁位於桃園，桃園市長鄭文燦對藻礁議題的發言
4.綠政黨對藻礁議題連結就是支時重啟核四,但又是前總統馬英九封存核四的，藍營則反駁抹黑造謠等。
5.珍愛藻礁公投連署活動

http://127.0.0.1:4321/#topic=1&lambda=0.6&term=

七、網路分析

建立網路關係

# topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")
#基本檔 
news_topic_basic <- tibble(
  topic=c(1:5),
  topics_name=topics_name
)

news1_topic_tmp<- news_topic %>% select (artUrl,topics_name) %>% 
  mutate(document=1:n())  %>% 
  select (document,topics_name,document)

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(topics_name)` instead of `topics_name` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

news1_topic_tmp<-  gather(news1_topic_tmp,topics_name,"gamma",-document)

news_gamma <-
news1_topic_tmp %>% 
  left_join(news_topic_basic)  %>% #每個document對應為1~5個topic 
  arrange(document)

## Joining, by = "topics_name"

#計算每個主題的新聞數量
news_gamma <- news_gamma %>% 
  group_by(document) %>% 
  filter(gamma == (max(gamma))) #gamma值最大歸屬於該主題 

table(news_gamma$topic)

## 
##   1   2   3   4   5 
##  94 154 111  94 126

link <- news_gamma %>% 
  # 只篩選 gamma 值大於 0.6
  filter(gamma > 0.6) %>%
  # 把 gamma 值當成 link 的權重
  rename(weight = gamma)

# 把 topic 欄位的 1 取代成 topic 1，依此類推，避免與 document 的 1 混淆 
link$topic <- link$topic %>%
  gsub(1, "topic 1", .) %>%
  gsub(2, "topic 2", .) %>%
  gsub(3, "topic 3", .) %>%
  gsub(4, "topic 4", .) %>%
  gsub(5, "topic 5", .)

# 建立無向圖
TopicNetwork <- graph_from_data_frame(d = link, directed = F)
TopicNetwork

## IGRAPH c7c62bf UNW- 77 72 -- 
## + attr: name (v/c), weight (e/n), topic (e/c)
## + edges from c7c62bf (vertex names):
##  [1] 20 --珍愛藻礁公投連署            26 --珍愛藻礁公投連署           
##  [3] 33 --珍愛藻礁公投連署            35 --珍愛藻礁公投連署           
##  [5] 37 --珍愛藻礁公投連署            38 --珍愛藻礁公投連署           
##  [7] 39 --珍愛藻礁公投連署            40 --珍愛藻礁公投連署           
##  [9] 43 --珍愛藻礁公投連署            44 --珍愛藻礁公投連署           
## [11] 45 --珍愛藻礁公投連署            48 --珍愛藻礁公投連署           
## [13] 52 --珍愛藻礁公投連署            61 --珍愛藻礁公投連署           
## [15] 63 --珍愛藻礁公投連署            64 --珍愛藻礁公投連署           
## + ... omitted several edges

set.seed(2021)
plot(TopicNetwork)

# 調整點線大小且不顯示節點名稱
#
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5, vertex.label = NA)

# 顯示有超過 5 個關聯的節點名稱
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5,
     vertex.label = ifelse(degree(TopicNetwork) > 5, V(TopicNetwork)$name, NA), vertex.label.font = 15)

> Gamma值>0.6 幾個topic分佈均頗為獨立

vertex_attr(TopicNetwork)

## $name
##  [1] "20"                          "26"                         
##  [3] "33"                          "35"                         
##  [5] "37"                          "38"                         
##  [7] "39"                          "40"                         
##  [9] "43"                          "44"                         
## [11] "45"                          "48"                         
## [13] "52"                          "61"                         
## [15] "63"                          "64"                         
## [17] "65"                          "74"                         
## [19] "82"                          "86"                         
## [21] "87"                          "89"                         
## [23] "90"                          "94"                         
## [25] "95"                          "104"                        
## [27] "125"                         "133"                        
## [29] "150"                         "165"                        
## [31] "175"                         "183"                        
## [33] "209"                         "211"                        
## [35] "221"                         "225"                        
## [37] "228"                         "256"                        
## [39] "257"                         "258"                        
## [41] "261"                         "262"                        
## [43] "278"                         "280"                        
## [45] "286"                         "308"                        
## [47] "343"                         "344"                        
## [49] "348"                         "352"                        
## [51] "369"                         "376"                        
## [53] "377"                         "378"                        
## [55] "382"                         "385"                        
## [57] "394"                         "395"                        
## [59] "440"                         "441"                        
## [61] "456"                         "457"                        
## [63] "464"                         "507"                        
## [65] "513"                         "521"                        
## [67] "527"                         "530"                        
## [69] "557"                         "566"                        
## [71] "568"                         "573"                        
## [73] "珍愛藻礁公投連署"            "藍綠政治攻防"               
## [75] "桃園政府對藻礁生態/開發意向" "政院三接方案"               
## [77] "政府/環團溝通"

設定連結屬性

# 設定連結的 type 為主題分類
E(TopicNetwork)$type <- link$topic
# 設定 weight
E(TopicNetwork)$weight <- link$weight

# edge_attr(TopicNetwork)

Density

edge_density(TopicNetwork)

## [1] 0.02460697

reciprocity(TopicNetwork)

## [1] 1

無向圖, reciprocity=1

transitivity(TopicNetwork, type="global")

## [1] 0

diameter(TopicNetwork, directed = F)

## [1] 1.565776

mean_distance(TopicNetwork, directed=F)

## [1] 1.88697

deg <- degree(TopicNetwork, mode = "all")
plot(TopicNetwork, 
     # 依照 degree 大小設定節點大小
     vertex.size = deg * 3,
     # 只顯示 degree 大於 5 的節點名稱
     vertex.label = ifelse(deg > 5, V(TopicNetwork)$name, NA))

> 珍愛藻礁公投連署最多degree , 概念上表示辨識為該topic的新聞數量最多

# Histogram of node degree
hist(deg, breaks = 1:vcount(TopicNetwork)-1, main = "Histogram of node degree")

# Degree distribution
deg.dist <- degree_distribution(TopicNetwork, cumulative=T, mode="all")
plot( x=0:max(deg), y=1-deg.dist, pch=19, cex=1.2, col="orange", xlab="Degree", ylab="Cumulative Frequency")

y軸為累積的出現頻率，Degree 1~2點的差距較大，表示degree 2的節點數最多

# Degree Centrality 

degree(TopicNetwork, mode = "all")

##                          20                          26 
##                           1                           1 
##                          33                          35 
##                           1                           1 
##                          37                          38 
##                           1                           1 
##                          39                          40 
##                           1                           1 
##                          43                          44 
##                           1                           1 
##                          45                          48 
##                           1                           1 
##                          52                          61 
##                           1                           1 
##                          63                          64 
##                           1                           1 
##                          65                          74 
##                           1                           1 
##                          82                          86 
##                           1                           1 
##                          87                          89 
##                           1                           1 
##                          90                          94 
##                           1                           1 
##                          95                         104 
##                           1                           1 
##                         125                         133 
##                           1                           1 
##                         150                         165 
##                           1                           1 
##                         175                         183 
##                           1                           1 
##                         209                         211 
##                           1                           1 
##                         221                         225 
##                           1                           1 
##                         228                         256 
##                           1                           1 
##                         257                         258 
##                           1                           1 
##                         261                         262 
##                           1                           1 
##                         278                         280 
##                           1                           1 
##                         286                         308 
##                           1                           1 
##                         343                         344 
##                           1                           1 
##                         348                         352 
##                           1                           1 
##                         369                         376 
##                           1                           1 
##                         377                         378 
##                           1                           1 
##                         382                         385 
##                           1                           1 
##                         394                         395 
##                           1                           1 
##                         440                         441 
##                           1                           1 
##                         456                         457 
##                           1                           1 
##                         464                         507 
##                           1                           1 
##                         513                         521 
##                           1                           1 
##                         527                         530 
##                           1                           1 
##                         557                         566 
##                           1                           1 
##                         568                         573 
##                           1                           1 
##            珍愛藻礁公投連署                藍綠政治攻防 
##                          25                           8 
## 桃園政府對藻礁生態/開發意向                政院三接方案 
##                          14                          14 
##               政府/環團溝通 
##                          11

centr_degree(TopicNetwork, mode = "all", normalized = T)

## $res
##  [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [26]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [51]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 25  8 14
## [76] 14 11
## 
## $centralization
## [1] 0.3043404
## 
## $theoretical_max
## [1] 5852

各主題 (topic1, topic2, topic3, topic4,topic5 ) 的節點的 Degree Centrality 較高
res : 節點中心度
centralization
theoretical_max : 最大中心化分數

# eigen_centrality(TopicNetwork, directed = F, weights = NA)
# centr_eigen(TopicNetwork, directed = F, normalized = T)

betweenness(TopicNetwork, directed = F) %>% head()

## 20 26 33 35 37 38 
##  0  0  0  0  0  0

edge_betweenness(TopicNetwork, directed = F)

##  [1] 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25  8 25  8 25 14 25 14 25
## [26] 25 14 11 11 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 25  8  8 14 14 14
## [51] 11 11 11 11 11  8 11 14 11 11 25  8 14 14  8  8 14 14 14 25 14 11

centr_betw(TopicNetwork, directed = F, normalized = T)

## $res
##  [1]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [20]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [39]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [58]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 300  28  91  91
## [77]  55
## 
## $centralization
## [1] 0.1040397
## 
## $theoretical_max
## [1] 216600

# Find cliques 
net.sym <- as.undirected(TopicNetwork, mode = "collapse", edge.attr.comb = list(weight = "sum", "ignore"))

#cliques(net.sym) # list of cliques       
sapply(cliques(net.sym), length) # clique sizes

##   [1] 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [38] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
##  [75] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [112] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [149] 2

#largest_cliques(net.sym) # cliques with max number of nodes 
vcol <- rep("grey80", vcount(net.sym))
vcol[unlist(largest_cliques(net.sym))] <- "gold"
plot(as.undirected(net.sym), vertex.label=V(net.sym)$name, vertex.color=vcol)

# Community detection 

ceb <- cluster_edge_betweenness(TopicNetwork)

#dendPlot(ceb, mode="hclust")
plot(ceb, TopicNetwork)

phi_m <- topics_words %>% arrange(desc(phi)) %>% top_n(70)

## Selecting by phi

dtm <-phi_m %>% cast_dtm(topic, term, phi)

dtmm<-as.matrix(dtm)
dim(dtmm)

## [1]  5 62

network=graph_from_incidence_matrix(dtmm)

# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex=0.7, vertex.size=10, vertex.label.family = "Heiti TC Light")

## Warning in text.default(x, y, labels = labels, col = label.color, family =
## label.family, : font family not found in Windows font database

使用Topic Model分析大潭藻礁新聞

第三組讀書會

2021/05/15

組員:

一、套件取得及資料載入

套件

資料描述

二、Document Term Matrix (DTM)

資料前處理

將資料轉換為Document Term Matrix (DTM)

三、主題模型

建立LDA模型

利用LDA模型建立phi矩陣

尋找Topic的代表字

四、尋找最佳主題數

建立更多主題的主題模型

透過perplexity找到最佳主題數

產生LDAvis結果

產生LDAvis檔案，存至local端

五、LDA分析

選定5個主題數的主題模型

主題命名

Document 主題分佈

查看特定主題的文章

了解主題在時間的變化

去除筆數少月份

以比例了解主題時間變化

六、補充 - 不同訓練LDA模型套件

建立DTM matrix

LDA 模型

七、網路分析

建立網路關係

設定連結屬性

Density