資料基本介紹

ch0.套件取得及資料載入

套件

library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 4.0.5
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.0.5
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(RColorBrewer)
## Loading required package: RColorBrewer
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
packages = c("jiebaR", "tm", "tmcn", "dplyr", "tidytext", "ggplot2", "wordcloud2", "rvest","stringr","dygraphs","knitr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

資料描述

透過中山管院文字分析平台,載入PPT八卦版資料,搜尋關鍵字為「停電」,時間從2020/05/11到2021/05/15。

metadata <- fread("news_articleMetaData.csv", encoding = "UTF-8")

停電相關討論在5月13日當天數量急遽上升

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

plottext <- metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())
plottext
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dygraphs)
## Warning: package 'dygraphs' was built under R version 4.0.5
date.ts1 <- data.frame(zoo(plottext[,-1],plottext$artDate))
dygraph(date.ts1,main = "關於停電在 ptt Gossiping 版發文數") %>%
  dySeries("count", label = "發文數量", col='   #FF2D2D') %>% 
  dyRangeSelector() 

第一張圖看不出11和12日發文數量,由此圖可以看出更詳細的發文數

Ch1. Document Term Matrix (DTM)

資料前處理

使用默認參數初始化一個斷詞引擎

jieba_tokenizer = worker()
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

計算每篇文章各token出現次數

tokens <- metadata %>%
  unnest_tokens(word, artContent, token=news_tokenizer) %>%
  filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
  filter(!(word %in% c("有沒有", "什麼", "可以", "今天", "現在","是不是","這樣", "沒有","就是","所以","不是","一個","我們","大家","怎麼","還是"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens %>% head(20)

將資料轉換為Document Term Matrix (DTM)

查看DTM矩陣,可以發現是個稀疏矩陣。

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 1148, terms: 12138)>>
## Non-/sparse entries: 45641/13888783
## Sparsity           : 100%
## Maximal term length: 26
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 20/80
## Sparsity           : 80%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       一片 了嗎 之地 內湖
##   https://www.ptt.cc/bbs/Gossiping/M.1620663828.A.D21.html    1    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620665267.A.2DE.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620700271.A.29A.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701392.A.0B2.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701583.A.DC8.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784163.A.AB4.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784167.A.289.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620800155.A.BE3.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620826960.A.79B.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620888538.A.035.html    0    0    0    0
##                                                           Terms
## Docs                                                       化外 本以 有人
##   https://www.ptt.cc/bbs/Gossiping/M.1620663828.A.D21.html    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620665267.A.2DE.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620700271.A.29A.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701392.A.0B2.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701583.A.DC8.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784163.A.AB4.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784167.A.289.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620800155.A.BE3.html    0    0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620826960.A.79B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620888538.A.035.html    0    0    0
##                                                           Terms
## Docs                                                       沒想到 停電 這邊
##   https://www.ptt.cc/bbs/Gossiping/M.1620663828.A.D21.html      1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1620665267.A.2DE.html      0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620700271.A.29A.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701392.A.0B2.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620701583.A.DC8.html      0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784163.A.AB4.html      1    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620784167.A.289.html      0    3    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620800155.A.BE3.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620826960.A.79B.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1620888538.A.035.html      0    2    0

ch2. 主題模型

建立LDA模型

lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs") #調整alpha即delta
lda
## A LDA_VEM topic model with 2 topics.

利用LDA模型建立phi矩陣

topics_words <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words

尋找Topic的代表字

terms依照各主題的phi值由大到小排序,列出前10大

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

從上圖來看發現Topic 1主要以台灣現況做區分,Topic 2則為停電原因做討論

#ch3. 尋找最佳主題數

建立更多主題的主題模型

嘗試2、4、6、10、15個主題數,將結果存起來,再做進一步分析。 此部分需要跑一段時間,已經將跑完的檔案存成ldas_result.rdata,可以直接載入

ldas = c()
topics = c(2,4,6,10,15)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
  ldas =c(ldas,lda)
  print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
  save(ldas,file = "ldas_result.rdata") # 將模型輸出成檔案
}

載入每個主題的LDA結果

load("ldas_result.rdata")

透過perplexity找到最佳主題數

topics = c(2,4,6,10,15)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.

perplexity 越小越好,但是太小的話,主題數會分太細。

create LDAvis所需的json function 此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ###以下function 用來解決,主題數多會出現NA的問題
    ### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}

產生LDAvis結果

# install.packages('servr') 
the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)

產生LDAvis檔案,存至local端

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

ch4. LDA分析

選定4個主題數的主題模型

the_lda = ldas[[2]] ## 選定topic 為 4 的結果
topics_words <- tidy(the_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)

terms依照各主題的phi值由大到小排序

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

去除共通詞彙

removed_word = c("停電", "台灣")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題命名

topics_name = c("台灣現況","台電電網系統危機","政府處理問題","興達電廠電網事故")

從查看特定文章發現以下敘述: Topic1為台灣最近遭遇缺水、疫情及缺電情況並抱怨這天氣停電沒冷氣根本世界末日; Topic2討論電廠跳電質疑台電備載容量不足; Topic3主要描述政府對於停電與疫情問題的相關比較; Topic4為興達電廠電網事故造成部分機組停機,台電緊急搶修進而展開分區輪流限電。

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

現在我們看每一篇的文章分佈了!

查看特定主題的文章

  • 透過找到特定文章的分佈進行排序之後,可以看到此主題的比重高的文章在討論什麼。
news_topic %>%
  arrange(desc('興達電廠電網事故')) %>%head(10) 
news_topic <- dplyr::select(news_topic, -(system_id))

了解主題在時間的變化

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  # filter( !format(artDate,'%m%d') %in% 0512) 去除筆數較少的天數
  group_by(artDate = format(artDate,'%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

由圖中顯示5月13日停電當天網友紛紛在ptt上回報相關停電災情,以台灣現況為主題廣為討論,內容都以疫情期間出現停水及停電相關

以比例了解主題時間變化

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( !format(artDate,'%Y%m') %in% 0512)%>%
  group_by(artDate = format(artDate,'%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,5,8,12)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

補充 - 不同訓練LDA模型套件

參考 http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tokens <- metadata %>%
  unnest_tokens(word, artContent, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))| str_detect(word, regex("[Aa][Zz]")))

建立DTM matrix

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 1147  335

LDA 模型

set.seed(2019)

topic_n = 4

lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)
## INFO  [22:21:46.147] early stopping at 140 iteration 
## INFO  [22:21:46.310] early stopping at 50 iteration

這個比topicmodels的package跑快超多倍

一樣可以用LDAvis的套件來看

lda_model$get_top_words(n = 10, lambda = 0.5) ## 查看 前10主題字
##       [,1]   [,2]     [,3]     [,4]    
##  [1,] "停電" "完整"   "停電"   "電網"  
##  [2,] "我們" "供電"   "疫情"   "容量"  
##  [3,] "沒有" "表示"   "台灣"   "電廠"  
##  [4,] "真的" "新聞"   "缺水"   "問題"  
##  [5,] "時候" "興達"   "是不是" "備轉"  
##  [6,] "政府" "記者"   "現在"   "電力"  
##  [7,] "什麼" "緊急"   "疫苗"   "發電"  
##  [8,] "不能" "分區"   "防疫"   "機組"  
##  [9,] "就是" "事故"   "八卦"   "發電廠"
## [10,] "大家" "匯流排" "今天"   "太陽能"
lda_model$plot()
## Loading required namespace: servr
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

結論

本組從主題模型來分析大家對於停電所圍繞的主題為何,得出由疫情加劇期間台灣所發生的停水、停電近況到停電事故緣由及台電緊急搶修應對到最後政府應如何解決此次問題,從LDAvis看出topic 1 討論頻率較高且調節λ參數發現topic 1 為“政府”、“停電”;topic 2 為“興達”、“供電”;topic 3 為“疫情”、“缺水”;topic 4 為“容量”、“備轉”也與我們上述命名相符。