ch0. Package Loading and Data Import

Packages

library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(RColorBrewer)
## Loading required package: RColorBrewer
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

Data Description

Over the past few days, local COVID-19 cases in Taiwan have surged. By collecting the articles posted on PTT during this period, we hope to identify possible causes or events behind the surge and to understand how netizens generally feel about the spike in confirmed cases. For this assignment, we used the 中山管院文字分析平台 (NSYSU College of Management text analytics platform) to load articles from PTT Gossiping, searching with the keywords「本土、個案、肺炎」(local, case, pneumonia) from 2021/05/14 to 2021/05/16.

metadata <- fread("covidtw_articleMetaData.csv", encoding = "UTF-8")
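A quick sanity check that the crawl covers the expected window (this sketch relies on the artDate column that is also used in the plot below):

# confirm the date range and the number of articles loaded
range(as.Date(metadata$artDate))
nrow(metadata)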

As the plot shows, Taiwan suddenly reported 180 local cases on 5/15, so the discussion volume spiked; before that, posts were mostly about other pandemic-related information (e.g., shared news articles and vaccines).

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    labs(x = "日期", y = "文章數")+
    geom_point()

ch1. Document Term Matrix (DTM)

Data Preprocessing

Initialize a jieba segmentation engine with a custom stop-word list

jieba_tokenizer <- worker(stop_word = "stopwords.txt")
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
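A quick way to sanity-check the tokenizer is to run it on a single sentence; the exact segmentation depends on the jieba dictionary and on stopwords.txt:

# example: segment one sentence into tokens of length > 1
news_tokenizer("今天本土個案大增")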

Count how many times each token appears in each article

tokens <- metadata %>%
  unnest_tokens(word, sentence, token = news_tokenizer) %>%
  # drop tokens containing digits or Latin letters, except "AZ" (the AstraZeneca vaccine)
  filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
  count(artUrl, word) %>%
  rename(count = n)
tokens %>% head(20)
##                                                       artUrl     word count
##  1: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     入住     1
##  2: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     大眾     1
##  3: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     已經     5
##  4: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     不進     1
##  5: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     中央     1
##  6: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     分已     1
##  7: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     升溫     1
##  8: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     出院     1
##  9: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     出現     2
## 10: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     只出     1
## 11: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     台北     4
## 12: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html   台北市     2
## 13: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 台北市立     2
## 14: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     市民     3
## 15: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     市長     2
## 16: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     目前     3
## 17: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     休息     1
## 18: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     全面     1
## 19: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     共同     1
## 20: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html     共識     1

Convert the data into a Document Term Matrix (DTM)

dtm <- tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 309, terms: 6705)>>
## Non-/sparse entries: 20031/2051814
## Sparsity           : 99%
## Maximal term length: 6
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 15/85
## Sparsity           : 85%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       入住 大眾 已經 不進
##   https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html    1    1    5    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html    0    0    0    0
##                                                           Terms
## Docs                                                       中央 分已 升溫 出院
##   https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html    1    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html    0    0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html    2    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html    3    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html    0    0    0    0
##                                                           Terms
## Docs                                                       出現 只出
##   https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html    2    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html    3    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html    0    0
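Before fitting any model, it can help to rank terms by their total frequency across the corpus; a minimal sketch using slam (which tm already loads as a dependency):

# total count of each term over all 309 documents
term_totals <- slam::col_sums(dtm)
head(sort(term_totals, decreasing = TRUE), 10)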

ch2. Topic Model

Build the LDA model

lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # tune alpha and delta with Gibbs sampling
lda
## A LDA_VEM topic model with 2 topics.

Extract the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 13,410 x 3
##    topic term       phi
##    <int> <chr>    <dbl>
##  1     1 入住  1.09e-10
##  2     2 入住  1.30e- 4
##  3     1 大眾  3.64e- 4
##  4     2 大眾  1.54e- 4
##  5     1 已經  3.18e- 3
##  6     2 已經  2.56e- 3
##  7     1 不進  2.18e-10
##  8     2 不進  1.30e- 4
##  9     1 中央  3.11e- 4
## 10     2 中央  4.08e- 3
## # ... with 13,400 more rows
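The document-topic (theta) matrix can be pulled out the same way, using matrix = "gamma" instead of "beta":

# theta: one row per document-topic pair, with the topic proportion
doc_topics <- tidy(lda, matrix = "gamma")
doc_topics %>% head(6)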

Find representative terms for each topic

Sort the terms in each topic by phi in descending order and list the top 10

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

ch3. Finding the Optimal Number of Topics

Build topic models with more topics

Try 2, 4, 6, 10, and 15 topics, save the results, and then analyze them further. This step takes a while to run; the fitted models have already been saved to covidtw_result.rdata, which can be loaded directly.

# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas, file = "covidtw_result.rdata") # save the models to a file
# }

Load the saved LDA results for each number of topics

load("covidtw_result.rdata")

Find the optimal number of topics via perplexity

topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>% # data_frame() is deprecated; use tibble()
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
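Perplexity is (roughly) the exponentiated negative log-likelihood per token, so lower values indicate a better fit; as a sketch of the underlying quantity for one fitted model:

# perplexity ~ exp(-logLik / total number of tokens); lower is better
exp(-as.numeric(logLik(ldas[[1]])) / sum(slam::col_sums(dtm)))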

Create the JSON needed by LDAvis: this function converts the model built earlier with the LDA() function into the input format expected by the LDAvis package.

topicmodels_json_ldavis <- function(fitted, doc_term){
  require(LDAvis)
  require(slam)

  ### The following function works around NAs that appear when the number of topics is large
  ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
  ls_LDA = function (phi){
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    pca.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
  }

  # Find required quantities
  phi <- as.matrix(posterior(fitted)$terms)
  theta <- as.matrix(posterior(fitted)$topics)
  vocab <- colnames(phi)
  term_freq <- slam::col_sums(doc_term)

  # Convert to json; doc.length must be the token count per document,
  # so use row_sums rather than counting nonzero entries
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = slam::row_sums(doc_term),
                                 term.frequency = term_freq, mds.method = ls_LDA)

  return(json_lda)
}

Generate the LDAvis result

the_lda = ldas[[2]] # the k = 4 model
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)

Generate the LDAvis files and save them locally

serVis(json_res, out.dir = "vis", open.browser = T)
# re-encode the exported JSON as UTF-8 and write it back so Chinese terms display correctly
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"), "./vis/lda.json")

ch4. LDA Analysis

Select the model with 4 topics

the_lda = ldas[[2]] ## select the k = 4 result
topics_words <- tidy(the_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term     phi
##    <int> <chr>  <dbl>
##  1     3 確診  0.0285
##  2     4 確診  0.0271
##  3     1 確診  0.0259
##  4     3 完整  0.0153
##  5     2 確診  0.0141
##  6     3 新聞  0.0136
##  7     2 個案  0.0133
##  8     3 疫情  0.0123
##  9     4 本土  0.0106
## 10     1 台灣  0.0106

Sort the terms in each topic by phi in descending order

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Remove some common, uninformative words:

removed_word = c("確診","本土","記者","備註","表示","所以","相關","疫情","就是","現在","今天","什麼")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Name the topics

topics_name = c("相關事件","指揮中心","個案足跡","臺灣民情") # related events, CECC, case footprints, Taiwanese public sentiment

Document topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)
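Each row of document_topics_df is a probability distribution, so the four topic columns of every article should sum to 1:

# sanity check: topic proportions per article sum to 1
round(head(document_topics_df, 3), 3)
summary(rowSums(document_topics_df))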

Now we can see each article's topic distribution!

View articles for a specific topic

  • By sorting articles on their weight for a particular topic, we can see what the articles that load heavily on that topic are discussing.
news_topic %>%
  arrange(desc(個案足跡)) %>% head(10) # note: desc() needs the bare column, not a string, or nothing is sorted

Topic trends over time

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Topic trends over time, as proportions

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate = format(artDate,'%Y%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Appendix - A different package for training LDA models

Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))| str_detect(word, regex("[Aa][Zz]")))

Build the DTM

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 309 133
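A peek at what survives the frequency pruning; document_term_matrix() returns a sparse Matrix, so Matrix::colSums applies:

# most frequent of the 133 retained terms
head(sort(Matrix::colSums(dtm_clean), decreasing = TRUE), 10)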

LDA model

set.seed(2019)
  
topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [20:32:14.566] early stopping at 110 iteration 
## INFO  [20:32:14.680] early stopping at 50 iteration
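Unlike topicmodels, fit_transform() returns the document-topic distribution directly, so there is no separate posterior() step:

# each row is one document's distribution over the 4 topics
round(head(doc_topic_distr, 3), 2)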

This runs many times faster than the topicmodels package.

The results can likewise be viewed with the LDAvis package:

lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 terms per topic
##       [,1]   [,2]     [,3]   [,4]      
##  [1,] "肺炎" "確診"   "隔離" "個案"    
##  [2,] "台大" "現在"   "防疫" "疫情"    
##  [3,] "醫院" "知道"   "確診" "完整"    
##  [4,] "感染" "今天"   "消毒" "指揮中心"
##  [5,] "萬華" "這樣"   "進行" "本土"    
##  [6,] "國籍" "如果"   "學生" "其中"    
##  [7,] "武漢" "就是"   "通知" "新聞"    
##  [8,] "北部" "什麼"   "來源" "新增"    
##  [9,] "應該" "是不是" "居家" "群聚"    
## [10,] "開始" "有沒有" "台北" "網址"
lda_model$plot()
## Loading required namespace: servr
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)