Group members:

1. Loading packages & data

1.1 Packages

library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

1.2 Data description

Using the text-analysis platform, we collected news articles from 聯合新聞網, 蘋果新聞網, and 東森新聞網, searching for the keywords 「藻礁、三接、陳昭倫、潘忠政」 over the period 2020-11-01 to 2021-05-15.

1.3 Loading the data

metadata <- fread("./data/news_reef_articleMetaData.csv", encoding = "UTF-8")

The figure below shows several peaks in the discussion of the algal-reef referendum:
1) Around the 228 holiday weekend, news coverage grew as calls to sign the petition spread.
2) 3/13: the campaign reported receiving more than 690,000 signature forms, ready to be submitted to the Central Election Commission to certify the referendum.
3) 3/31: Council of Agriculture minister 陳吉仲 visited campaign initiator 潘忠政 on behalf of the government.
4) 4/22: on Earth Day, President 蔡英文 met with environmental groups, including 潘忠政, but no common ground was found on the algal-reef issue.
5) 5/3: the Executive Yuan proposed moving the third LNG receiving terminal (三接) further offshore.

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="gray")+
    geom_point(aes(colour = artDate))
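To make the peaks listed above easier to spot, the event dates could be overlaid on the daily-count plot. This is a minimal sketch; the dates are taken from the list above and the vertical lines are purely illustrative annotations.

event_dates <- as.Date(c("2021-02-28", "2021-03-13", "2021-03-31",
                         "2021-04-22", "2021-05-03"))
metadata %>%
  mutate(artDate = as.Date(artDate)) %>%
  count(artDate) %>%
  ggplot(aes(artDate, n)) +
  geom_line(color = "gray") +
  geom_vline(xintercept = event_dates, linetype = "dashed", color = "red")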

2. Document Term Matrix (DTM)

2.1 Data preprocessing

2.1.1 Initialize a jieba segmentation engine with a custom dictionary and stop-word list

jieba_tokenizer = worker(user="reef_dict.txt", stop_word = "./data/reef_stop_words.txt")

news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens whose string length is 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
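As a quick sanity check, the tokenizer can be applied to a short made-up sentence; this is only a sketch, and the exact segmentation depends on the contents of reef_dict.txt and reef_stop_words.txt.

news_tokenizer("大潭藻礁的生態保育議題")
# returns a list with one character vector of tokens, single-character tokens already removed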

2.1.2 Count the occurrences of each token in every article

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))  %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens 

2.2 Convert the data into a Document Term Matrix (DTM)

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 573, terms: 12129)>>
## Non-/sparse entries: 69591/6880326
## Sparsity           : 99%
## Maximal term length: 7
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 11/89
## Sparsity           : 89%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          一年 人士 人事 人事安排 人選 三方
##   https://news.ebc.net.tw/news/article/250089    1    1    2        2    1    1
##   https://news.ebc.net.tw/news/article/250897    0    2    0        0    0    0
##   https://news.ebc.net.tw/news/article/250926    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251058    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251166    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251281    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251438    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251477    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251725    0    0    0        0    0    0
##   https://news.ebc.net.tw/news/article/251792    0    0    0        0    0    0
##                                              Terms
## Docs                                          上是 上海 大陸 不論是
##   https://news.ebc.net.tw/news/article/250089    1    1    3      1
##   https://news.ebc.net.tw/news/article/250897    0    0    0      0
##   https://news.ebc.net.tw/news/article/250926    0    0    0      0
##   https://news.ebc.net.tw/news/article/251058    0    0    0      0
##   https://news.ebc.net.tw/news/article/251166    0    0    0      0
##   https://news.ebc.net.tw/news/article/251281    0    0    0      0
##   https://news.ebc.net.tw/news/article/251438    0    0    0      0
##   https://news.ebc.net.tw/news/article/251477    0    0    0      0
##   https://news.ebc.net.tw/news/article/251725    0    0    0      0
##   https://news.ebc.net.tw/news/article/251792    0    0    0      0
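The full DTM is about 99% sparse. If model fitting turns out to be slow or memory-heavy, very rare terms could optionally be pruned beforehand, for example with tm::removeSparseTerms. This is only a sketch; the 0.99 threshold is illustrative and the pruned matrix is not used in the rest of this report.

dtm_small <- removeSparseTerms(dtm, 0.99) # drop terms absent from more than 99% of documents
dtm_small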

3. Topic model

3.1 Build the LDA model

# lda <- LDA(dtm, k = 3, control = list(seed = 2021))
# lda <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs")
# alpha = 50/k; testing on the TMWS platform, delta = 0.2 worked better (topic centers are farther apart)
lda <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 10, delta = 0.2), method = "Gibbs")
# tune alpha and delta
lda
## A LDA_Gibbs topic model with 5 topics.

3.2 Build the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words

3.3 Find representative terms for each topic

Sort the terms within each topic by phi in descending order and list the top 10.

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

4. Finding the optimal number of topics

4.1 Build topic models with different numbers of topics

We fit models with 3, 4, 5, 7, and 9 topics and store the results for further analysis. This step takes a while to run; the fitted models have been saved to ldas_result.rdata and can be loaded directly.

ldas = c()
topics = c(3,4,5,7,9)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
  ldas = c(ldas, lda)
  print(paste(topic, "topic(s) and use time is", Sys.time() - start_time))
  save(ldas, file = "ldas_result.rdata") # save the fitted models to a file
}

Load the saved LDA results

load("ldas_result.rdata")

Find the best number of topics via perplexity

topics = c(3,4,5,7,9)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

  • With 5 topics the perplexity is low and its decrease starts to level off.
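Based on this, the 5-topic model can be pulled back out of the stored list. A small sketch, assuming ldas keeps the same order as the topics vector above (so the k = 5 model is the third element):

best_lda <- ldas[[which(topics == 5)]] # equivalent to ldas[[3]]
best_lda@k                             # prints 5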

Create the JSON required by LDAvis.
This function converts a model fitted with the LDA() function above into the input format expected by the LDAvis package.

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ### The helper below works around NAs that can appear when the number of topics is large
    ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(slam::row_sums(doc_term)), # tokens per document
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}

Generate the LDAvis results

for(lda in ldas){

  k = lda@k ## number of topics in this model
  if(k==2){next}
  json_res <- topicmodels_json_ldavis(lda, dtm)
  # serVis(json_res, open.browser = T)
  lda_dir = paste0(k, "_ldavis")
  if(!dir.exists(lda_dir)){ dir.create(lda_dir) }

  serVis(json_res, out.dir = lda_dir, open.browser = F)

  # rewrite lda.json as UTF-8 so the Chinese terms display correctly
  writeLines(iconv(readLines(paste0(lda_dir, "/lda.json")), to = "UTF-8"),
             file(paste0(lda_dir, "/lda.json"), encoding = "UTF-8"))
}


the_lda = ldas[[3]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
# note: on Windows this line did not open the LDAvis page in the browser
serVis(json_res,open.browser = T)

Generate the LDAvis files and save them locally

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

5. LDA analysis

Select the topic model with 5 topics

# the_lda = ldas[[3]] ## the k = 5 model from the stored list
the_lda_5 <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 10, delta = 0.2), method = "Gibbs") # 5 topics
topics_words <- tidy(the_lda_5, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(50)

Sort the terms within each topic by phi in descending order

topics_words %>%
  group_by(topic) %>%
  top_n(15, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Remove terms that are common across topics

removed_word = c("藻礁","表示","可以")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Topic naming

生態保護 (ecological protection) - this topic covers conserving the algal reef and the coastal ecosystem.

# topics_name = c("生態保護","政府方案","政府與環團溝通","反綠營","能源政策") # alternative naming (government vs. concerned-party confrontation)
topics_name = c("珍愛藻礁公投連署","政院三接外推方案","府方試圖找環團溝通","抹黑造謠","藍綠政黨攻防")

Document topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda_5)
doc_pro <- tmResult$topics # topic probability distribution of each article
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df = data.frame(document_topics) # convert document_topics into a data frame
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata, document_topics_df)

Now we can look at the topic distribution of every article.
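For a quick look, the first few rows with their topic columns can be printed; a minimal sketch using the columns created above:

news_topic %>% 
  select(artTitle, all_of(topics_name)) %>% 
  head()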

Examine the articles of a specific topic

  • Sorting the articles by their weight on a given topic shows what the articles with a high proportion of that topic are about.

news_topic %>% 
  arrange(desc(`珍愛藻礁公投連署`)) %>% select(artTitle, artDate, `珍愛藻礁公投連署`) %>% head(30) 

The “珍愛藻礁公投連署” (Cherish the Algal Reef referendum signature drive) topic mostly covers the petition's appeals and signature-collection activities before mid-March.

news_topic %>%
  arrange(desc(`政院三接外推方案`)) %>%  select(artTitle,artDate,`政院三接外推方案`) %>% head(30) 

The “政院三接外推方案” (Executive Yuan's offshore-relocation plan for the third terminal) topic mostly covers the plan the Executive Yuan announced on 5/3, after the referendum had been certified, aimed at swaying voters toward voting no.

news_topic %>%
  arrange(desc(`府方試圖找環團溝通`)) %>% select(artTitle,artDate,`府方試圖找環團溝通`) %>% head(30) 

The “府方試圖找環團溝通” (government outreach to environmental groups) topic: once the signature count was confirmed to have reached about 700,000 in late March, the government had 陳吉仲 start communicating with the environmental groups, a change from its attitude a month earlier.

news_topic %>%
  arrange(desc(`藍綠政黨攻防`)) %>% select(artTitle,artDate,`藍綠政黨攻防`) %>% head(30) 

The “藍綠政黨攻防” (blue-green partisan sparring) topic mostly reflects the environmental issue turning into a political one, with KMT and DPP politicians weighing in.

news_topic %>%
  arrange(desc(`抹黑造謠`)) %>% select(artTitle,artDate,`抹黑造謠`) %>% head(30) 

Topic trends over time

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>% # aggregate by month
  summarise_if(is.numeric, sum, na.rm = TRUE) %>% # monthly sum of topic weights (≈ number of articles per topic)
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

March was the signature-collection stage; after the KMT's 江啟臣 voiced support, more politicians took public positions.
In May the dominant story was 王美花's press conference presenting the Executive Yuan's plan to move the third terminal further offshore.

Remove months with few articles

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>% # drop 2020-11 and 2021-05, which have few articles
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + #bar圖 
    scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Topic trends over time as proportions

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12,18)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Supplement - training LDA with a different package

Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.4
tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))

Build the DTM

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dim(dtm)
## [1]   573 12129
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 20) # drop terms that appear fewer than 20 times
dim(dtm_clean)
## [1] 573 870

The LDA model

set.seed(20190)

topic_n = 5
#lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.004) # also works well
# doc_topic_prior (alpha) = 0.15 and topic_word_prior (beta) = 0.004 give well-separated topics
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.15, topic_word_prior = 0.004)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)
## INFO  [16:18:15.977] early stopping at 140 iteration 
## INFO  [16:18:16.212] early stopping at 50 iteration

This runs many times faster than the topicmodels package.
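To check this informally, the fit can be wrapped in system.time() and compared with the earlier topicmodels::LDA call timed the same way. A sketch only; timings depend on hardware and iteration settings, and lda_model2 is a throwaway model created just for the measurement.

lda_model2 <- text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.15, topic_word_prior = 0.004)
system.time(
  lda_model2$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5)
)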

The result can likewise be inspected with the LDAvis tooling.

lda_model$get_top_words(n = 30, lambda = 0.5) ## show the top 30 terms of each topic
##       [,1]     [,2]       [,3]     [,4]     [,5]    
##  [1,] "天然氣" "連署"     "藻礁"   "溝通"   "國民黨"
##  [2,] "接收站" "公投"     "桃園"   "環團"   "民進黨"
##  [3,] "三接"   "珍愛"     "生態"   "吉仲"   "英文"  
##  [4,] "公頃"   "萬份"     "保護"   "方案"   "立委"  
##  [5,] "經濟部" "小組"     "保育"   "政府"   "臉書"  
##  [6,] "電廠"   "中選會"   "環境"   "農委會" "台灣"  
##  [7,] "影響"   "藻礁"     "海岸"   "總統"   "藻礁"  
##  [8,] "供電"   "門檻"     "鄭文燦" "對話"   "核四"  
##  [9,] "方案"   "領銜"     "大潭"   "聯盟"   "公投"  
## [10,] "燃煤"   "總部"     "環保"   "公投"   "馬英九"
## [11,] "能源"   "民眾"     "市長"   "提出"   "支持"  
## [12,] "王美花" "潘忠政說" "市府"   "主委"   "執政黨"
## [13,] "公里"   "收到"     "保護區" "潘忠政" "重啟"  
## [14,] "穩定"   "工作"     "國人"   "會議"   "政治"  
## [15,] "行政院" "參與"     "開發"   "見面"   "政府"  
## [16,] "興建"   "成案"     "桃園市" "說明"   "議題"  
## [17,] "第三"   "公民"     "時代"   "討論"   "反對"  
## [18,] "蘇貞昌" "進入"     "中南部" "意見"   "總統"  
## [19,] "發電"   "志工"     "議題"   "總統府" "抹黑"  
## [20,] "增加"   "搶救"     "關注"   "雙方"   "政黨"  
## [21,] "面積"   "何宗勳"   "加入"   "朋友"   "處理"  
## [22,] "電力"   "份數"     "自然"   "共識"   "主席"  
## [23,] "孫大千" "規定"     "當地"   "雙贏"   "江啟"  
## [24,] "減少"   "協會"     "生物"   "程序"   "過去"  
## [25,] "機組"   "全台"     "觀新藻" "問題"   "王浩宇"
## [26,] "中油"   "投案"     "市政府" "代表"   "限制"  
## [27,] "燃氣"   "珊瑚"     "成立"   "會面"   "執政"  
## [28,] "大潭"   "整理"     "狀況"   "議題"   "網路"  
## [29,] "工業港" "學生"     "受到"   "同意"   "批評"  
## [30,] "轉型"   "發起"     "重視"   "團體"   "委員會"
lda_model$plot() 
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

The five topics found by this LDA package (text2vec), as shown in LDAvis, are:
1. The Executive Yuan's plan: moving the third terminal further offshore, plus assessments of power supply and the energy transition
2. The government (陳吉仲, President 蔡英文) communicating with representatives of the environmental groups
3. The 大潭 algal reef lies in Taoyuan, so Taoyuan mayor 鄭文燦's statements on the issue
4. The green camp linking the algal-reef issue to support for restarting the Fourth Nuclear Power Plant, even though it was former president 馬英九 who mothballed the plant, while the blue camp counters that this is smearing and disinformation
5. The 珍愛藻礁 (Cherish the Algal Reef) referendum signature drive