Ch0.套件取得及資料載入

套件

library(data.table)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(jiebaR)

## Loading required package: jiebaRD

library(tidytext)
library(stringr)
library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(topicmodels)
library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:data.table':
## 
##     transpose

require(RColorBrewer)

## Loading required package: RColorBrewer

# 20-colour palette obtained by interpolating the 8-colour "Set3" Brewer
# palette; the plots further down pick entries from it by index.
mycolors <- colorRampPalette(
  colors = brewer.pal(n = 8, name = "Set3")
)(n = 20)

資料描述

透過中山管院文字分析平台，載入聯合新聞網、蘋果新聞網、東森新聞網的新聞，搜尋關鍵字為「疫苗」，時間從2020/10/01到2021/05/09。

# Read the news article metadata CSV, declaring its encoding as UTF-8.
metadata <- data.table::fread(
  input = "news_articleMetaData.csv",
  encoding = "UTF-8"
)

可以看到疫苗討論在2月過後的新聞報導數量增加

# Number of articles per day, drawn as a red line with one point per day.
metadata %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(mapping = aes(x = artDate, y = count)) +
  geom_line(colour = "red") +
  geom_point()

## `summarise()` ungrouping output (override with `.groups` argument)

Ch1. Document Term Matrix (DTM)

資料前處理

使用默認參數初始化一個斷詞引擎

# Segmentation engine created with jiebaR's default settings.
jieba_tokenizer <- worker()

# Custom tokenizer passed to unnest_tokens().
#
# @param t Character vector of texts, one element per row to tokenize.
# @return A list with one element per entry of `t`. Each element is a
#   character vector holding the segmented tokens that are longer than one
#   character. Missing texts and texts of at most one character give an
#   empty character vector.
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    # nchar(NA_character_) is NA, and `if (NA > 1)` is an error, so missing
    # texts are handled explicitly together with texts too short to segment.
    if (is.na(x) || nchar(x) <= 1) {
      return(character(0))
    }
    tokens <- segment(x, jieba_tokenizer)
    # Drop tokens that are a single character long.
    tokens[nchar(tokens) > 1]
  })
}

計算每篇文章各token出現次數

# Tokenize every article, drop tokens containing ASCII letters or digits
# (tokens containing "az" in any letter case are kept), then count how often
# each token occurs in each article.
tokens <- metadata %>%
  unnest_tokens(output = word, input = sentence, token = news_tokenizer) %>%
  filter(
    !str_detect(word, "[0-9a-zA-Z]") | str_detect(word, "[Aa][Zz]")
  ) %>%
  count(artUrl, word, name = "count")
head(tokens, 20)

將資料轉換為Document Term Matrix (DTM)

# Cast the per-article token counts into a document-term matrix
# (documents: article URLs, terms: tokens, values: counts) and print it.
dtm <- cast_dtm(tokens, document = artUrl, term = word, value = count)
dtm

## <<DocumentTermMatrix (documents: 2870, terms: 33133)>>
## Non-/sparse entries: 421416/94670294
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)

# Show the first 10 documents x first 10 terms of the document-term matrix.
inspect(dtm[seq_len(10), seq_len(10)])

## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 33/67
## Sparsity           : 67%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          一出 上漲 公司 周一 報導 宣布
##   https://news.ebc.net.tw/news/article/235190    1    4    1    1    1    1
##   https://news.ebc.net.tw/news/article/235496    0    0    4    0    0    0
##   https://news.ebc.net.tw/news/article/235549    0    0    0    0    0    1
##   https://news.ebc.net.tw/news/article/235666    0    0    5    1    0    0
##   https://news.ebc.net.tw/news/article/235696    0    0    1    0    0    2
##   https://news.ebc.net.tw/news/article/236685    0    0    1    0    0    0
##   https://news.ebc.net.tw/news/article/236879    0    0    2    0    1    0
##   https://news.ebc.net.tw/news/article/236890    0    0    1    0    1    0
##   https://news.ebc.net.tw/news/article/236967    0    0    5    0    0    0
##   https://news.ebc.net.tw/news/article/236977    0    0    0    0    1    1
##                                              Terms
## Docs                                          技術 指數 數據 暴漲
##   https://news.ebc.net.tw/news/article/235190    1    1    1    1
##   https://news.ebc.net.tw/news/article/235496    0    0    2    0
##   https://news.ebc.net.tw/news/article/235549    0    0    4    0
##   https://news.ebc.net.tw/news/article/235666    0    0    7    0
##   https://news.ebc.net.tw/news/article/235696    0    0    0    0
##   https://news.ebc.net.tw/news/article/236685    0    0    2    0
##   https://news.ebc.net.tw/news/article/236879    1    0    3    0
##   https://news.ebc.net.tw/news/article/236890    1    0    3    0
##   https://news.ebc.net.tw/news/article/236967    0    0    3    0
##   https://news.ebc.net.tw/news/article/236977    0    0    0    0

ch2. 主題模型

建立LDA模型

# Reference calls for fitting the model directly (kept commented out):
# lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs") # tune alpha and delta
#

# Load the saved results and take the second model of `ldas`.
# NOTE(review): the reference calls above use k = 2; confirm that the second
# element of `ldas` is the model intended for this section.
load(file = "ldas_result.rdata")
lda <- ldas[[2]]

利用LDA模型建立phi矩陣

# Note: tidy() has to be asked for matrix = "beta" to obtain the phi matrix;
# the columns are then renamed to topic / term / phi.
topics_words <- setNames(
  tidy(lda, matrix = "beta"),
  c("topic", "term", "phi")
)
topics_words

尋找Topic的代表字

terms依照各主題的phi值由大到小排序，列出前10大

# Horizontal bar chart of the 10 highest-phi terms of every topic, one facet
# per topic, with terms ordered by phi inside each facet.
topics_words %>%
  group_by(topic) %>%
  top_n(n = 10, wt = phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(x = term, by = phi, within = topic)) %>%
  ggplot(mapping = aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

ch3. 尋找最佳主題數

建立更多主題的主題模型

嘗試2、4、6、10、15個主題數，將結果存起來，再做進一步分析。此部分需要跑一段時間，已經將跑完的檔案存成ldas_result.rdata，可以直接載入

# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas,file = "ldas_result.rdata") # 將模型輸出成檔案
# }

載入每個主題的LDA結果

# Restore the objects saved in the results file (used below as `ldas`).
load(file = "ldas_result.rdata")

透過perplexity找到最佳主題數

# Perplexity of every fitted model plotted against its number of topics
# (smaller is better).
# `ldas` is assumed to hold one model per entry of `topics`, in the same
# order -- TODO confirm against how the results file was produced.
topics <- c(2, 4, 6, 10, 15)
# tibble() replaces data_frame(), which is deprecated as of tibble 1.1.0.
# perplexity() is namespace-qualified so the topicmodels version is used.
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(x = k, y = perplex)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Evaluating LDA topic models",
    subtitle = "Optimal number of topics (smaller is better)",
    x = "Number of topics",
    y = "Perplexity"
  )

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

補充 ldatuning

參考 https://rpubs.com/siri/ldatuning

Minimization: Arun2010、CaoJuan2009 Maximization: Deveaud2014、Griffiths2004

# if (!requireNamespace("ldatuning", quietly = TRUE)) install.packages("ldatuning")
# library("ldatuning")
# result <- FindTopicsNumber(
#   dtm,
#   topics = topics,
#   metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
#   method = "Gibbs",
#   control = list(seed = 2020),
#   mc.cores = 2L,
#   verbose = TRUE
# )
# FindTopicsNumber_plot(result)

產生LDAvis結果

建立 LDAvis 所需的 JSON 轉換函式：此函式將前面以 LDA() 建立的模型，轉換為 LDAvis 套件所需的輸入格式。

# Convert a fitted topic model into the JSON input expected by LDAvis.
#
# @param fitted Fitted model accepted by posterior().
# @param doc_term Document-term matrix in simple-triplet form that works
#   with slam::row_sums() / slam::col_sums() (presumably the matrix the
#   model was fitted on -- the caller must make sure its documents and terms
#   match the model).
# @return The JSON string returned by LDAvis::createJSON().
topicmodels_json_ldavis <- function(fitted, doc_term) {
  # library() instead of require(): a missing package now stops here with a
  # clear error instead of failing later. The packages are still attached,
  # so unqualified calls such as serVis() made after this function has run
  # keep working.
  library(LDAvis)
  library(slam)

  # Scaling function handed to createJSON() as `mds.method`. It adds a tiny
  # constant inside the logarithm of the Jensen-Shannon divergence to work
  # around NA results that show up with many topics.
  # See https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
  ls_LDA <- function(phi) {
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    pca.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
  }

  # Topic-term (phi) and document-topic (theta) matrices of the model.
  model_posterior <- posterior(fitted)
  phi <- as.matrix(model_posterior$terms)
  theta <- as.matrix(model_posterior$topics)
  vocab <- colnames(phi)

  # Total number of tokens in each document. The earlier
  # table(doc_term$i) counted the distinct terms of a document instead and
  # left out documents without any entry.
  doc_length <- as.vector(slam::row_sums(doc_term))

  # Corpus-wide frequency of each term, put in the order of the model
  # vocabulary whenever the names allow it.
  term_freq <- slam::col_sums(doc_term)
  if (!is.null(names(term_freq)) && all(vocab %in% names(term_freq))) {
    term_freq <- term_freq[vocab]
  }

  # Convert to json
  LDAvis::createJSON(
    phi = phi,
    theta = theta,
    vocab = vocab,
    doc.length = doc_length,
    term.frequency = term_freq,
    mds.method = ls_LDA
  )
}

# Build the LDAvis JSON for the second model in `ldas` and open the
# visualisation in the browser.
the_lda <- ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda, dtm)
# TRUE instead of T: T is an ordinary variable that can be reassigned.
serVis(json_res, open.browser = TRUE)

產生LDAvis檔案，存至local端

# Write the LDAvis files into ./vis and open them in the browser.
# TRUE instead of T: T is an ordinary variable that can be reassigned.
serVis(json_res, out.dir = "vis", open.browser = TRUE)

# Re-encode the generated JSON as UTF-8 and write it back to the same file.
# The earlier writeLines() call had no `con`, so the converted text was only
# printed to the console and the file on disk stayed as it was.
local({
  lda_json_path <- "./vis/lda.json"
  lda_json_utf8 <- iconv(readLines(lda_json_path), to = "UTF-8")
  # useBytes = TRUE writes the converted bytes as they are, without
  # translating them back to the native encoding.
  writeLines(lda_json_utf8, con = lda_json_path, useBytes = TRUE)
})

ch3. LDA分析

選定4個主題數的主題模型

the_lda <- ldas[[2]] # per the original note: the result with 4 topics

# Note: tidy() has to be asked for matrix = "beta" to obtain the phi matrix.
topics_words <- setNames(
  tidy(the_lda, matrix = "beta"),
  c("topic", "term", "phi")
)
# The 10 topic/term pairs with the highest phi.
head(arrange(topics_words, desc(phi)), 10)

terms依照各主題的phi值由大到小排序

# Horizontal bar chart of the 10 highest-phi terms of every topic, one facet
# per topic, with terms ordered by phi inside each facet.
topics_words %>%
  group_by(topic) %>%
  top_n(n = 10, wt = phi) %>%
  ungroup() %>%
  ggplot(
    mapping = aes(
      x = reorder_within(term, phi, topic),
      y = phi,
      fill = as.factor(topic)
    )
  ) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

去除共通詞彙

# Terms left out of the chart below.
removed_word <- c("肺炎","新冠","疫苗","接種","目前","表示","沒有")

# Same top-10 chart as before, computed after discarding `removed_word`.
topics_words %>%
  filter(!(term %in% removed_word)) %>%
  group_by(topic) %>%
  top_n(n = 10, wt = phi) %>%
  ungroup() %>%
  ggplot(
    mapping = aes(
      x = reorder_within(term, phi, topic),
      y = phi,
      fill = as.factor(topic)
    )
  ) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題命名

# Human-readable labels for the topics; used below as column names of the
# document-topic table.
# NOTE(review): the order is assumed to follow the model's topic numbering --
# confirm against the top-term charts.
topics_name <- c("AZ疫苗","台灣疫苗施打","疫苗研發進度","輝瑞疫苗")

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

現在我們可以看到每一篇文章的主題分佈了！

查看特定主題的文章

透過找到特定文章的分佈進行排序之後，可以看到此主題的比重高的文章在討論什麼。

# The 10 articles with the largest value in the `AZ疫苗` topic column.
news_topic %>%
  arrange(desc(`AZ疫苗`)) %>%
  head(n = 10)

了解主題在時間的變化

# Stacked bars of the monthly sum of every numeric column of `news_topic`.
# Changes from the earlier version:
#   * across(where(is.numeric), ...) replaces the superseded summarise_if();
#   * the summary is converted with as.data.table() before melt(), so the
#     melt step does not depend on which data-frame class summarise()
#     returns (presumably a tibble here -- data.table's own melt method
#     handles data.tables only).
news_topic %>%
  mutate(artDate = format(as.Date(artDate), "%Y%m")) %>%
  group_by(artDate) %>%
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  as.data.table() %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  ylab("value") +
  scale_fill_manual(values = mycolors[c(1, 5, 8, 12)]) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

去除筆數少月份

# Stacked bars of the monthly sum of every numeric column of `news_topic`,
# leaving out the months 2020-11 and 2021-05.
# Changes from the earlier version:
#   * the month key is compared against strings rather than numbers that
#     were implicitly coerced to text;
#   * across(where(is.numeric), ...) replaces the superseded summarise_if();
#   * the summary is converted with as.data.table() before melt(), so the
#     melt step does not depend on which data-frame class summarise()
#     returns (presumably a tibble here -- data.table's own melt method
#     handles data.tables only).
news_topic %>%
  mutate(artDate = format(as.Date(artDate), "%Y%m")) %>%
  filter(!artDate %in% c("202011", "202105")) %>%
  group_by(artDate) %>%
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  as.data.table() %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  ylab("value") +
  scale_fill_manual(values = mycolors[c(1, 5, 8, 12)]) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

以比例了解主題時間變化

# Stacked bars of each column's share of the monthly total, leaving out the
# months 2020-11 and 2021-05.
# Changes from the earlier version:
#   * the month key is compared against strings rather than numbers that
#     were implicitly coerced to text;
#   * across(where(is.numeric), ...) replaces the superseded summarise_if();
#   * the summary is converted with as.data.table() before melt(), so the
#     melt step does not depend on which data-frame class summarise()
#     returns (presumably a tibble here -- data.table's own melt method
#     handles data.tables only);
#   * the grouping used for the monthly total is dropped before plotting.
news_topic %>%
  mutate(artDate = format(as.Date(artDate), "%Y%m")) %>%
  filter(!artDate %in% c("202011", "202105")) %>%
  group_by(artDate) %>%
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  as.data.table() %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ungroup() %>%
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") +
  ylab("proportion") +
  scale_fill_manual(values = mycolors[c(1, 5, 8, 12)]) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

補充 - 不同訓練LDA模型套件

參考 http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)

## 
## Attaching package: 'text2vec'

## The following object is masked from 'package:topicmodels':
## 
##     perplexity

library(udpipe)
# Tokenize every article again, one row per token, dropping tokens that
# contain ASCII letters or digits (tokens containing "az" in any letter case
# are kept). No counting is done at this step.
tokens <- metadata %>%
  unnest_tokens(output = word, input = sentence, token = news_tokenizer) %>%
  filter(
    !str_detect(word, "[0-9a-zA-Z]") | str_detect(word, "[Aa][Zz]")
  )

建立DTM matrix

# Term frequencies per article, keyed by article URL and token.
dtf <- document_term_frequencies(
  x = tokens,
  document = "artUrl",
  term = "word"
)
# Document-term matrix built from those frequencies.
dtm <- document_term_matrix(x = dtf)
# Low-frequency terms removed with a minimum frequency of 30.
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)

## [1] 2870 2585

LDA 模型

# Fit a 4-topic LDA model with text2vec on the cleaned document-term matrix.
set.seed(2019)

topic_n <- 4

lda_model <- text2vec::LDA$new(
  n_topics = topic_n,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.001
)
# The convergence-check interval is passed as `n_check_convergence`. The
# earlier name `check_convergence_every_n` is not a fit_transform() argument,
# so it had no effect: the recorded log shows a check every 10 iterations
# rather than every 100.
doc_topic_distr <- lda_model$fit_transform(
  dtm_clean,
  n_iter = 1000,
  convergence_tol = 1e-5,
  n_check_convergence = 100
)

## INFO [2021-05-11 18:13:09] iter 10 loglikelihood = -3401386.894
## INFO [2021-05-11 18:13:09] iter 20 loglikelihood = -3319596.164
## INFO [2021-05-11 18:13:09] iter 30 loglikelihood = -3301715.557
## INFO [2021-05-11 18:13:10] iter 40 loglikelihood = -3292816.997
## INFO [2021-05-11 18:13:10] iter 50 loglikelihood = -3286405.602
## INFO [2021-05-11 18:13:10] iter 60 loglikelihood = -3281816.286
## INFO [2021-05-11 18:13:11] iter 70 loglikelihood = -3278442.914
## INFO [2021-05-11 18:13:11] iter 80 loglikelihood = -3274703.318
## INFO [2021-05-11 18:13:12] iter 90 loglikelihood = -3271543.279
## INFO [2021-05-11 18:13:12] iter 100 loglikelihood = -3269433.890
## INFO [2021-05-11 18:13:12] iter 110 loglikelihood = -3267137.383
## INFO [2021-05-11 18:13:13] iter 120 loglikelihood = -3265079.295
## INFO [2021-05-11 18:13:13] iter 130 loglikelihood = -3264500.213
## INFO [2021-05-11 18:13:14] iter 140 loglikelihood = -3263332.312
## INFO [2021-05-11 18:13:14] iter 150 loglikelihood = -3261413.076
## INFO [2021-05-11 18:13:14] iter 160 loglikelihood = -3260744.192
## INFO [2021-05-11 18:13:15] iter 170 loglikelihood = -3258841.171
## INFO [2021-05-11 18:13:15] iter 180 loglikelihood = -3257687.748
## INFO [2021-05-11 18:13:15] iter 190 loglikelihood = -3256901.227
## INFO [2021-05-11 18:13:16] iter 200 loglikelihood = -3255356.138
## INFO [2021-05-11 18:13:16] iter 210 loglikelihood = -3254382.159
## INFO [2021-05-11 18:13:17] iter 220 loglikelihood = -3253235.947
## INFO [2021-05-11 18:13:17] iter 230 loglikelihood = -3253197.343
## INFO [2021-05-11 18:13:17] iter 240 loglikelihood = -3252508.949
## INFO [2021-05-11 18:13:18] iter 250 loglikelihood = -3251733.676
## INFO [2021-05-11 18:13:18] iter 260 loglikelihood = -3253199.008
## INFO [2021-05-11 18:13:18] early stopping at 260 iteration

這個比topicmodels的package跑快超多倍

一樣可以用LDAvis的套件來看

lda_model$get_top_words(n = 10, lambda = 0.5) ## show the top 10 words of each topic

##       [,1]   [,2]       [,3]   [,4]  
##  [1,] "病毒" "疫苗"     "接種" "疫苗"
##  [2,] "疫苗" "指揮中心" "確診" "中國"
##  [3,] "試驗" "施打"     "疫苗" "國家"
##  [4,] "研究" "採購"     "死亡" "公司"
##  [5,] "臨床" "整理"     "新增" "全球"
##  [6,] "變種" "醫院"     "病例" "輝瑞"
##  [7,] "保護" "az"       "血栓" "歐盟"
##  [8,] "可能" "中央"     "出現" "生產"
##  [9,] "免疫" "台灣"     "累計" "藥廠"
## [10,] "感染" "時間"     "香港" "美國"

# Visualise the fitted model through the model object's own plot() method.
lda_model$plot()

## Loading required namespace: servr

# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

使用主題模型分析新冠肺炎疫苗中文新聞資料

王品堯

2021/05/11

Ch0.套件取得及資料載入

套件

資料描述

Ch1. Document Term Matrix (DTM)

資料前處理

將資料轉換為Document Term Matrix (DTM)

ch2. 主題模型

建立LDA模型

利用LDA模型建立phi矩陣

尋找Topic的代表字

建立更多主題的主題模型

透過perplexity找到最佳主題數

補充 ldatuning

產生LDAvis結果

產生LDAvis檔案，存至local端

ch3. LDA分析

選定4個主題數的主題模型

主題命名

Document 主題分佈

查看特定主題的文章

了解主題在時間的變化

去除筆數少月份

以比例了解主題時間變化

補充 - 不同訓練LDA模型套件

建立DTM matrix

LDA 模型

一樣可以用LDAvis的套件來看