ch1.套件取得及資料載入

套件

library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
require(readr)
require(NLP)
require(tidyr)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

資料描述

透過中山管院文字分析平台,載入聯合新聞網、蘋果新聞網、東森新聞網的新聞,搜尋關鍵字為「藻礁、三接、陳昭倫、潘忠政」,時間從2020/11/01到2021/05/15

filePath <- "/Users/hungwenchun/Desktop/data/sea"
setwd(filePath)
metadata <- fread("news_reef_articleMetaData.csv", encoding = "UTF-8")%>%
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence=gsub("\n", "", sentence)) %>% 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))

可以看到藻礁公投討論有幾波討論高點與趨勢

1.在228連假時連署呼聲的新聞報導數量增加

2.3/13藻礁公投連署書收69萬餘件,準備送進中選會進行公投成案

3.3/31農委會主委陳吉仲代表政府拜訪發起來潘忠政

4.4/22世界地球日蔡英文總統接見環團組織,含潘忠政對藻礁議題無交集 5.5/3政院提三接外推方案

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

range(metadata$artDate)

斷句

# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
reaf_sentences <- strsplit(metadata$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
reaf_sentences <- data.frame(
                        artUrl = rep(metadata$artUrl, sapply(reaf_sentences, length)), 
                        artDate = rep(metadata$artDate, sapply(reaf_sentences, length)),
                        sentence = unlist(reaf_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))

reaf_sentences$sentence <- as.character(reaf_sentences$sentence)

reaf_sentences

建立斷詞辭典

jieba_tokenizer = worker(user="reef_dict.txt", stop_word="reef_stop_words.txt")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

斷詞

# 剛才的斷詞結果沒有使用新增的辭典,因此我們重新進行斷詞,再計算各詞彙在各文章中出現的次數
reaf_words <- reaf_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
reaf_words

#Ch1. Document Term Matrix (DTM)

dtm <- reaf_words %>% cast_dtm(artUrl, word, count)
dtm
inspect(dtm[1:10,1:10])

ch2. 主題模型

建立LDA模型

#lda <- LDA(dtm, k = 2, control = list(seed = 2021))
lda <- LDA(dtm, k = 4, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")

利用LDA模型建立phi矩陣

## 利用LDA模型建立phi矩陣
topics_words <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words

尋找Topic的代表字

terms依照各主題的phi值由大到小排序,列出前10大

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

NA
NA

ch3. 尋找最佳主題數

建立更多主題的主題模型

嘗試2、4、6、10、15個主題數,將結果存起來,再做進一步分析。

#ldas = c()
#topics = c(2,4,6,10,15)
  #for(topic in topics){
   #start_time <- Sys.time()
   #lda <- LDA(dtm, k = topic, control = list(seed = 2021))
   #ldas =c(ldas,lda)
   #print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
   #save(ldas,file = "ldas_result.rdata") # 將模型輸出成檔案
  #}

載入每個主題的LDA結果

load("ldas_result.rdata")

透過perplexity找到最佳主題數

選擇分成四個主題

topics = c(2,4,6,10,15)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

create LDAvis所需的json function 此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。


topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ###以下function 用來解決,主題數多會出現NA的問題
    ### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}

產生LDAvis結果

the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)

產生LDAvis檔案,存至local端

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

主題1 “LDAvis” 主題2 “LDAvis” 主題3 “LDAvis” 主題4 “LDAvis”

ch4. LDA分析

選定4個主題數的主題模型

the_lda = ldas[[2]] ## 選定topic 為 4 的結果
topics_words <- tidy(the_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)

terms依照各主題的phi值由大到小排序

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

去除共通詞彙,藻礁、公投、連署、今天、提出、希望、影響、政府

removed_word = c("藻礁","公投","連署","今天","提出","希望","影響","政府")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題命名

topics_name = c("政黨議題","能源議題","生態保育議題","方案議題")

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

現在我們看每一篇的文章分佈了!

查看特定主題的文章

  • 透過找到特定文章的分佈進行排序之後,可以看到此主題的比重高的文章在討論什麼。

政黨議題:多為政治人物對藻礁公投的支持與否的看法 ex:由環保團體發起的「珍愛藻礁公投連署」已突破50萬大關,國民黨表態大力支持、時代力量黨主席陳椒華今於臉書大力為公投催連署

news_topic %>%
  arrange(desc(`政黨議題`)) %>% head(30) 

方案議題:對藻礁公投的訴求和議題的討論 ex:行政院提出三接外推455公尺方案,總計離岸1.2公里,宣稱對沿岸影響更輕微、不破壞礁體,還宣稱是「雙贏」方案

news_topic %>%
  arrange(desc(`方案議題`)) %>% head(30) 

生態保育議題:蓋第三天然氣接收站對藻礁與生態環境的破壞 ex:桃園大潭藻礁生態因將蓋第三天然氣接收站,恐衝擊生態、除了桃園沿岸的藻礁之外,新竹的新豐海岸也有藻礁,學者指出,當地的藻礁是全台灣最南端的藻礁,具有特殊意義

news_topic %>%
   arrange(desc(`生態保育議題`)) %>%head(30) 

能源議題:藻礁公投對台塑、天然氣、燃煤等能源的影響 ex:中油選在觀塘蓋天然氣第三接收站,與供電的時程息息相關、若無大潭燃氣供電,可能回到2014年前中火與麥寮電廠火力全開的排碳量、大潭藻礁設置中油三接站引發爭議,外傳總統蔡英文找上台塑,希望由台塑位在麥寮機組由燃煤改為燃煤,作為三接的替代方案

news_topic %>%
  arrange(desc(`能源議題`)) %>%head(30) 

了解主題在時間的變化

可以了解到在三月份,這四種議題都被討論到,而四、五月較針對藻礁本身的公投議題去討論,五月最主要的議題王美花召開記者會,提出政院三接外推案

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

以比例了解主題時間變化

2月最主要是政黨之間對藻礁公投的看法,可能會模糊焦點,而之後較針對此議題去做討論

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

---
title: "使用主題模型分析藻礁公投中文新聞資料"
author: "洪玟君"
date: "2021/05/17"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    highlight: pygments
    theme: flatly
    css: style.css
  html_document:
    toc: yes
    df_print: paged
---

```{r echo = T, results = 'hide'}
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
```
# ch1.套件取得及資料載入
## 套件
```{r}
library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
require(readr)
require(NLP)
require(tidyr)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
```

## 資料描述

> 透過中山管院文字分析平台，載入聯合新聞網、蘋果新聞網、東森新聞網的新聞，搜尋關鍵字為「藻礁、三接、陳昭倫、潘忠政」，時間從2020/11/01到2021/05/15

```{r}
filePath <- "/Users/hungwenchun/Desktop/data/sea"
setwd(filePath)
metadata <- fread("news_reef_articleMetaData.csv", encoding = "UTF-8")%>%
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence=gsub("\n", "", sentence)) %>% 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
```

> 可以看到藻礁公投討論有幾波討論高點與趨勢

1.在228連假時連署呼聲的新聞報導數量增加

2.3/13藻礁公投連署書收69萬餘件,準備送進中選會進行公投成案

3.3/31農委會主委陳吉仲代表政府拜訪發起來潘忠政

4.4/22世界地球日蔡英文總統接見環團組織,含潘忠政對藻礁議題無交集 5.5/3政院提三接外推方案
```{r}
metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()
```
```{r}
range(metadata$artDate)
```
## 斷句
```{r}
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
reaf_sentences <- strsplit(metadata$sentence,"[。！；？!?;]")
# 將每句句子，與他所屬的文章連結配對起來，整理成一個dataframe
reaf_sentences <- data.frame(
                        artUrl = rep(metadata$artUrl, sapply(reaf_sentences, length)), 
                        artDate = rep(metadata$artDate, sapply(reaf_sentences, length)),
                        sentence = unlist(reaf_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))

reaf_sentences$sentence <- as.character(reaf_sentences$sentence)

reaf_sentences
```

## 建立斷詞辭典
```{r}
jieba_tokenizer = worker(user="reef_dict.txt", stop_word="reef_stop_words.txt")
```

```{r}
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
```

## 斷詞
```{r}
# 剛才的斷詞結果沒有使用新增的辭典，因此我們重新進行斷詞，再計算各詞彙在各文章中出現的次數
reaf_words <- reaf_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
reaf_words
```

#Ch1. Document Term Matrix (DTM)

```{r}
dtm <- reaf_words %>% cast_dtm(artUrl, word, count)
dtm
inspect(dtm[1:10,1:10])
```
# ch2. 主題模型
## 建立LDA模型
```{r}
#lda <- LDA(dtm, k = 2, control = list(seed = 2021))
lda <- LDA(dtm, k = 4, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
```

## 利用LDA模型建立phi矩陣
```{r}
## 利用LDA模型建立phi矩陣
topics_words <- tidy(lda, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
```

## 尋找Topic的代表字

> terms依照各主題的phi值由大到小排序，列出前10大

```{r}
topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
  
 
```

# ch3. 尋找最佳主題數

## 建立更多主題的主題模型

> 嘗試2、4、6、10、15個主題數，將結果存起來，再做進一步分析。

```{r eval=FALSE}
#ldas = c()
#topics = c(2,4,6,10,15)
  #for(topic in topics){
   #start_time <- Sys.time()
   #lda <- LDA(dtm, k = topic, control = list(seed = 2021))
   #ldas =c(ldas,lda)
   #print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
   #save(ldas,file = "ldas_result.rdata") # 將模型輸出成檔案
  #}
```


> 載入每個主題的LDA結果

```{r}
load("ldas_result.rdata")
```

## 透過perplexity找到最佳主題數
選擇分成四個主題
```{r}
topics = c(2,4,6,10,15)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
```

> create LDAvis所需的json function
此function是將前面使用 "LDA function"所建立的model，轉換為"LDAVis"套件的input格式。

```{r}

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ###以下function 用來解決，主題數多會出現NA的問題
    ### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}
```

## 產生LDAvis結果

```{r eval=FALSE}

the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)
```
### 產生LDAvis檔案，存至local端
```{r eval=FALSE}
serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))
```
主題1
!["LDAvis"](tp1.PNG)
主題2
!["LDAvis"](tp2.PNG)
主題3
!["LDAvis"](tp3.PNG)
主題4
!["LDAvis"](tp4.PNG)

# ch4. LDA分析

## 選定4個主題數的主題模型
```{r}
the_lda = ldas[[2]] ## 選定topic 為 4 的結果
```

```{r}
topics_words <- tidy(the_lda, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
```

> terms依照各主題的phi值由大到小排序

```{r}
topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
```

>去除共通詞彙，藻礁、公投、連署、今天、提出、希望、影響、政府

```{r}
removed_word = c("藻礁","公投","連署","今天","提出","希望","影響","政府")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
```

### 主題命名
```{r}
topics_name = c("政黨議題","能源議題","生態保育議題","方案議題")
```

## Document 主題分佈
```{r}
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)
```

> 現在我們看每一篇的文章分佈了！

### 查看特定主題的文章
+ 透過找到特定文章的分佈進行排序之後，可以看到此主題的比重高的文章在討論什麼。

政黨議題:多為政治人物對藻礁公投的支持與否的看法
ex:由環保團體發起的「珍愛藻礁公投連署」已突破50萬大關，國民黨表態大力支持、時代力量黨主席陳椒華今於臉書大力為公投催連署
```{r ,eval=FALSE}
news_topic %>%
  arrange(desc(`政黨議題`)) %>% head(30) 
```

方案議題:對藻礁公投的訴求和議題的討論
ex:行政院提出三接外推455公尺方案，總計離岸1.2公里，宣稱對沿岸影響更輕微、不破壞礁體，還宣稱是「雙贏」方案
```{r ,eval=FALSE}
news_topic %>%
  arrange(desc(`方案議題`)) %>% head(30) 
```

生態保育議題:蓋第三天然氣接收站對藻礁與生態環境的破壞
ex:桃園大潭藻礁生態因將蓋第三天然氣接收站，恐衝擊生態、除了桃園沿岸的藻礁之外，新竹的新豐海岸也有藻礁，學者指出，當地的藻礁是全台灣最南端的藻礁，具有特殊意義
```{r ,eval=FALSE}
news_topic %>%
   arrange(desc(`生態保育議題`)) %>%head(30) 
```

能源議題:藻礁公投對台塑、天然氣、燃煤等能源的影響
ex:中油選在觀塘蓋天然氣第三接收站，與供電的時程息息相關、若無大潭燃氣供電，可能回到2014年前中火與麥寮電廠火力全開的排碳量、大潭藻礁設置中油三接站引發爭議，外傳總統蔡英文找上台塑，希望由台塑位在麥寮機組由燃煤改為燃煤，作為三接的替代方案
```{r ,eval=FALSE}
news_topic %>%
  arrange(desc(`能源議題`)) %>%head(30) 
```


### 了解主題在時間的變化
可以了解到在三月份，這四種議題都被討論到，而四、五月較針對藻礁本身的公投議題去討論，五月最主要的議題王美花召開記者會，提出政院三接外推案
```{r warning=FALSE}
news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

### 以比例了解主題時間變化
2月最主要是政黨之間對藻礁公投的看法，可能會模糊焦點，而之後較針對此議題去做討論
```{r warning=FALSE}
news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
```




