# ch0. Environment setup, package loading, and data import

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS reports
## request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""

## Packages

library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
library(slam)
## 
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
## 
##     rollup
library(LDAvis)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
require(RColorBrewer)
## Loading required package: RColorBrewer
require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20) # interpolate the 8-color Set3 palette up to 20 colors

## Data description

Using the NSYSU College of Management (中山管院) text-analysis platform, we collected news articles from 聯合新聞網 (udn), 蘋果新聞網 (Apple Daily), and 東森新聞網 (EBC News), searching the keywords 「防疫」 and 「防疫措施」 over the period 2020/01/01 to 2021/05/13.

metadata <- fread("quarantine metadata.csv", encoding = "UTF-8")
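The later steps depend on three columns of this table: `artUrl` (the document key), `artDate` (the publication date), and `sentence` (the article text). A quick structural check, assuming the platform's standard export:

str(metadata) # confirm artUrl, artDate, and sentence are present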

News coverage of epidemic-prevention measures drops sharply after April 2020 and does not rise again until January and May of 2021.

plot <- metadata %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") + # a second, identical geom_line in the original was redundant and is removed
  geom_point()

ggplotly(plot)

# Ch1. Document Term Matrix (DTM)

## Preprocessing

Initialize a jiebaR tokenizer with the default parameters.

jieba_tokenizer <- worker()
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens[nchar(tokens) > 1]
    } else {
      character(0) # return an empty vector for degenerate inputs instead of NULL
    }
  })
}
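A quick call on a toy string shows the tokenizer's output format: one character vector of words per input element, with single-character tokens already removed (the example sentence and its segmentation are illustrative; the actual output depends on jieba's dictionary):

news_tokenizer(c("政府宣布新的防疫措施"))
# e.g. [[1]] "政府" "宣布" "防疫" "措施"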

Count the occurrences of each token in every article.

tokens <- metadata %>%
  unnest_tokens(word, sentence, token = news_tokenizer) %>%
  filter(!grepl('[[:punct:]]', word)) %>%  # drop punctuation
  filter(!grepl('[0-9a-zA-Z]', word)) %>%  # drop tokens containing digits or English letters
  count(artUrl, word) %>%
  rename(count = n)
tokens %>% head(20)

Convert the token counts into a Document Term Matrix (DTM).

dtm <- tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 1455, terms: 33608)>>
## Non-/sparse entries: 206580/48693060
## Sparsity           : 100%
## Maximal term length: 9
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 34/66
## Sparsity           : 66%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          一名 一定 人員 入境 口罩 大家
##   https://news.ebc.net.tw/news/article/195006    1    1    1    1    4    1
##   https://news.ebc.net.tw/news/article/195218    0    1    0    1    3    0
##   https://news.ebc.net.tw/news/article/195291    0    0    0    0    6    0
##   https://news.ebc.net.tw/news/article/195308    0    1    0    0    6    0
##   https://news.ebc.net.tw/news/article/195324    2    0    1    1    1    0
##   https://news.ebc.net.tw/news/article/195409    0    0    1    0    0    0
##   https://news.ebc.net.tw/news/article/195418    2    0    0    1    5    0
##   https://news.ebc.net.tw/news/article/195424    0    0    0    0    0    0
##   https://news.ebc.net.tw/news/article/195479    0    0    1    0    0    0
##   https://news.ebc.net.tw/news/article/195605    0    0    0    0    0    0
##                                              Terms
## Docs                                          大陸 女性 已經 不必
##   https://news.ebc.net.tw/news/article/195006    2    1    1    1
##   https://news.ebc.net.tw/news/article/195218    2    0    1    0
##   https://news.ebc.net.tw/news/article/195291    1    0    0    0
##   https://news.ebc.net.tw/news/article/195308    3    0    1    1
##   https://news.ebc.net.tw/news/article/195324    0    0    1    0
##   https://news.ebc.net.tw/news/article/195409    0    0    0    0
##   https://news.ebc.net.tw/news/article/195418    1    0    0    0
##   https://news.ebc.net.tw/news/article/195424    0    0    0    0
##   https://news.ebc.net.tw/news/article/195479    0    0    0    0
##   https://news.ebc.net.tw/news/article/195605    1    0    0    0

# ch2. Topic model

## Build the LDA model

# lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # tune alpha and delta
load("ldas_result.rdata")
lda = ldas[[1]]
lda
## A LDA_VEM topic model with 6 topics.

## Extract the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: pass matrix = "beta" to tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
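Because each topic's row of phi is a probability distribution over the whole vocabulary, its phi values should sum to 1; a quick sanity check:

topics_words %>%
  group_by(topic) %>%
  summarise(total_phi = sum(phi)) # every total should be (numerically) 1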

## Find each topic's representative terms

Sort the terms within each topic by phi in descending order and list the top 10.

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

## Fit topic models with more topics

Try a model with 6 topics, save the result, and then analyze it further.

ldas <- list()
topics <- c(6)
for (topic in topics) {
  start_time <- Sys.time()
  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
  ldas[[length(ldas) + 1]] <- lda
  print(paste(topic, "topic(s); elapsed time:", Sys.time() - start_time))
  save(ldas, file = "ldas_result.rdata") # write the models to disk
}
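When several values of k are fitted (e.g. `topics = c(2, 4, 6, 8)`), one rough way to compare them is perplexity on the training DTM (lower is better), which `topicmodels` provides for VEM models; the interpretability of the top terms should still be checked by eye. A minimal sketch, assuming `ldas` holds the fitted models:

data.frame(
  k = sapply(ldas, function(m) m@k),                     # number of topics of each model
  perplexity = sapply(ldas, function(m) perplexity(m, dtm))
)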

## Load the saved LDA results

load("ldas_result.rdata")

## Generate the LDAvis output

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)

    ### The helper below works around the NAs that LDAvis can produce when
    ### the number of topics is large.
    ### See https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA <- function(phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(doc_term)

    # Convert to json; doc.length must be the total token count of each
    # document, i.e. the row sums of the DTM (table(doc_term$i) in the
    # original counted distinct terms instead)
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                   vocab = vocab,
                                   doc.length = slam::row_sums(doc_term),
                                   term.frequency = term_freq, mds.method = ls_LDA)

    return(json_lda)
}
the_lda = ldas[[1]]
json_res <- topicmodels_json_ldavis(the_lda, dtm)
serVis(json_res, open.browser = TRUE)

## Write the LDAvis files to the local disk

serVis(json_res, out.dir = "vis", open.browser = TRUE)
# Re-encode the generated JSON as UTF-8 and write it back to the file
# (the original wrote to stdout instead of to the file)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF-8"), "./vis/lda.json")

# ch4. LDA analysis

## Select the 6-topic model

the_lda = ldas[[1]] ## select the 6-topic result (the original comment said 4, but ldas[[1]] is the k = 6 model)
topics_words <- tidy(the_lda, matrix = "beta") # note: pass matrix = "beta" to tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)

Sort the terms within each topic by phi in descending order.

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

## Remove words common to all topics

removed_word = c("表示","可以","沒有","口罩","疫情","我們","防疫","台灣","民眾","大家","他們","就是","還是","現在")
# mostly generic function words and the query terms themselves,
# e.g. "said", "can", "mask", "epidemic", "Taiwan", "everyone", "now"

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

### Naming the topics

topics_name = c("工作","醫院和數據","疫苗進度","公共活動及業者","執法單位","境外移入")
# English gloss: work, hospitals & statistics, vaccine progress,
# public events & businesses, law enforcement, imported cases

## Document-topic distribution

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)
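Each row of theta is one document's distribution over the six topics, so the topic columns of every article should sum to 1:

rowSums(document_topics) %>% head() # each value should be (numerically) 1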

Now we can inspect each article's topic distribution.

## View the articles of a specific topic

  • After sorting the articles by their probability on a specific topic, we can see what the articles weighted most heavily toward that topic are discussing.
news_topic %>%
  arrange(desc(`境外移入`)) %>%
  head(10)
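To make the listing easier to scan, you can keep only the key columns (shown here with `artUrl` and `artDate`; add whatever title or text columns your export contains):

news_topic %>%
  arrange(desc(`境外移入`)) %>%
  select(artUrl, artDate, `境外移入`) %>%
  head(10)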

## Topic trends over time

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors[c(1, 2, 3, 5, 8, 12)]) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

## Topic trends over time as proportions

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(!format(artDate, '%Y%m') %in% c("202011", "202105")) %>% # exclude these two months
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = mycolors[c(1, 2, 3, 5, 8, 12)]) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))