1. Package Loading and Data Import

Loading packages

library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(RColorBrewer)
## Loading required package: RColorBrewer

Data description

  • Data source: all articles and comments from the PTT Gossiping, MRT, and Taichung boards, collected via a text platform, 2020-05-01 ~ 2021-05-15
  • Datasets: yen_mrt1_articleMetaData.csv, yen_mrt2_articleMetaData.csv, yen_mrt3_articleMetaData.csv, plus the matching yen_mrt1-3_articleReviews.csv comment files
  • Keywords: 台中捷運, 中捷
  • Time range: 2020-05-01 ~ 2021-05-15
  • Article count: 480 in total
# Merge the data from the three boards
MetaData1 = fread('yen_mrt1_articleMetaData.csv',encoding = 'UTF-8')
MetaData2 = fread('yen_mrt2_articleMetaData.csv',encoding = 'UTF-8')
MetaData3 = fread('yen_mrt3_articleMetaData.csv',encoding = 'UTF-8')
Reviews1  = fread('yen_mrt1_articleReviews.csv',encoding = 'UTF-8')
Reviews2  = fread('yen_mrt2_articleReviews.csv',encoding = 'UTF-8')
Reviews3  = fread('yen_mrt3_articleReviews.csv',encoding = 'UTF-8')

MetaData = rbind(MetaData1, MetaData2, MetaData3)
Reviews  = rbind(Reviews1, Reviews2, Reviews3)

# Filter the articles again by keyword: 467 articles remain
keywords = c('台中捷運', '中捷')
toMatch = paste(keywords, collapse = "|")
MetaData = MetaData[grepl(toMatch, sentence) | grepl(toMatch, artTitle), ]

# Join each article's comments onto the filtered articles
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

Examining how the volume of discussion changes over time

MetaData %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

Three periods show a clear jump in discussion volume on the PTT boards (the sketch after this list shades them on the plot):

  • between November and December 2020
  • mid-March 2021
  • mid-to-late April 2021
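
As a hedged aside, the three spikes can be shaded directly on the daily-count plot. The window boundaries below are rough assumptions read off the chart, not exact event dates. Note that tm's NLP dependency masks ggplot2::annotate (see the package-loading messages above), so the namespaced call is required.

MetaData %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point() +
  # shade the three approximate spike windows (dates are eyeballed assumptions)
  ggplot2::annotate("rect",
                    xmin = as.Date(c("2020-11-01", "2021-03-10", "2021-04-15")),
                    xmax = as.Date(c("2020-12-31", "2021-03-25", "2021-04-30")),
                    ymin = -Inf, ymax = Inf, alpha = 0.15, fill = "steelblue")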

2. Document Term Matrix (DTM)

Data preprocessing

Initialize a jieba segmentation engine, supplying a custom user dictionary and stop-word list

jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens whose string length is 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
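
# Quick sanity check on the tokenizer; a hedged example, since the exact
# segmentation depends on the user dictionary and stop-word list loaded above.
ptt_tokenizer("台中捷運綠線通車")
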
# Tokenize both the articles and the comments
#MToken <- MetaData %>% unnest_tokens(word, sentence, token=ptt_tokenizer)
#RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=ptt_tokenizer)

# Combine the two token tables
#data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])

Count how many times each token occurs in each article

tokens <- MetaData %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens %>% head(20)
##                                                       artUrl     word count
##  1: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     報導     1
##  2: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     備註     2
##  3: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     變革     1
##  4: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     變化     1
##  5: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     不變     1
##  6: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     不依     1
##  7: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     部份     5
##  8: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     車站     1
##  9: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     出現     1
## 10: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     串連     2
## 11: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     此段     1
## 12: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     大雅     1
## 13: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     地下    10
## 14: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 東海大學     1
## 15: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     二任     1
## 16: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     發文     1
## 17: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     發展     1
## 18: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     方向     1
## 19: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     高架     2
## 20: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html     格式     1

Converting the tokens into a Document Term Matrix (DTM)

dtm <- tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 480, terms: 11067)>>
## Non-/sparse entries: 49447/5262713
## Sparsity           : 99%
## Maximal term length: 30
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 17/83
## Sparsity           : 83%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       報導 備註 變革 變化
##   https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html    1    2    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html    0    1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html    1    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html    1    1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html    0    0    0    0
##                                                           Terms
## Docs                                                       不變 不依 部份 車站
##   https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html    1    1    5    1
##   https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html    0    0    0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html    0    0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html    0    0    0    0
##                                                           Terms
## Docs                                                       出現 串連
##   https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html    1    2
##   https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html    0    0

3. Topic Model

Fitting an LDA model

lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # Gibbs variant for tuning alpha and delta
lda
## A LDA_VEM topic model with 2 topics.

Building the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: pass matrix = "beta" to tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 22,134 x 3
##    topic term       phi
##    <int> <chr>    <dbl>
##  1     1 報導  3.42e- 3
##  2     2 報導  1.98e- 3
##  3     1 備註  2.64e- 3
##  4     2 備註  8.33e- 4
##  5     1 變革  1.72e-59
##  6     2 變革  7.80e- 5
##  7     1 變化  1.40e- 4
##  8     2 變化  1.56e- 4
##  9     1 不變  2.81e- 5
## 10     2 不變  7.80e- 5
## # … with 22,124 more rows
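
As a hedged aside, tidy() can also return the document-topic (theta) matrix by passing matrix = "gamma"; the analysis below instead extracts it later via posterior().

tidy(lda, matrix = "gamma") %>% head()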

Finding each topic's representative terms

Sort the terms in each topic by phi in descending order and list the top 10 representative terms:

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # set a CJK font so Chinese labels render correctly

4. Finding the Optimal Number of Topics

Fitting models with more topics

Try 2, 4, 6, 10, and 15 topics, save the fitted models, and analyze the results further below.
(This step takes a while to run; the fitted models have already been saved to ldas_result.rdata and can be loaded directly.)

#ldas = c()
#topics = c(2,4,6,10,15)
#for(topic in topics){
#  start_time <- Sys.time()
#  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#  ldas =c(ldas,lda)
#  print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#  save(ldas,file = "ldas_result.rdata") # save the fitted models to a file
#}

Loading the fitted LDA models

load("ldas_result.rdata")

Finding the best number of topics via perplexity

topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

  • The more topics, the lower the perplexity and the purer each topic's content.
  • Pick the point where the decrease starts to level off (the sketch after this list computes the drop between successive values of k).
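
As a hedged complement, the elbow can be quantified: the relative drop in perplexity between successive values of k shrinks once past the elbow. This reuses the ldas list loaded above.

topics  = c(2,4,6,10,15)
perplex = map_dbl(ldas, topicmodels::perplexity)
# relative drop in perplexity from each k to the next
tibble(from_k = head(topics, -1), to_k = topics[-1],
       drop = -diff(perplex) / head(perplex, -1))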

Generating the LDAvis output

Create the JSON that LDAvis needs.
This function converts the model built with the LDA function above into the input format of the LDAvis package.

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ### The helper below works around NAs that can appear when the number of topics is large
    ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = as.vector(table(doc_term$i)),
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}
the_lda = ldas[[2]] # ldas[[2]] is the model with k = 4
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)

  • The size of each circle indicates how many documents the topic covers.
  • The farther apart the circles are, the more distinctive each topic is.
  • In the "Slide to adjust relevance metric" control, red represents phi; the smaller λ is set, the more the ranking highlights terms unique to each topic (see the relevance sketch after this list).
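
For reference, a minimal sketch of the relevance score that the λ slider controls, per the LDAvis paper (Sievert & Shirley, 2014): relevance = λ·log(φ) + (1−λ)·log(φ / p(w)). It assumes the_lda and the tm-style dtm defined earlier.

phi <- posterior(the_lda)$terms             # topic-word matrix (rows: topics)
p_w <- slam::col_sums(dtm) / sum(dtm)       # marginal term probabilities
p_w <- p_w[colnames(phi)]                   # align vocab order with phi
lambda <- 0.3
relevance <- lambda * log(phi) + (1 - lambda) * log(sweep(phi, 2, p_w, "/"))
head(sort(relevance[1, ], decreasing = TRUE), 10)  # top-10 topic-1 terms at this lambda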

Saving the LDAvis files locally

serVis(json_res, out.dir = "vis", open.browser = T)
# re-encode the JSON as UTF-8 and write it back in place
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF-8"), "./vis/lda.json")

From the LDAvis output we can get a first sense of what the four topics discuss:

  1. Terms such as 「運價」, 「票價」, 「站名」, and 「高捷」 suggest discussion of the Taichung MRT's basic operational planning.
  2. Terms such as 「藍線」, 「經費」, and 「林佳龍」 suggest discussion of funding issues around the Taichung MRT Blue Line.
  3. Terms such as 「軸心」, 「故障」, 「斷裂」, and 「川崎」 suggest discussion of the Taichung MRT axle-fracture incident.
  4. Terms such as 「改善」, 「履勘」, 「委員」, and 「初勘」 suggest discussion of experiences and suggestions from the Taichung MRT trial run.

5. LDA Analysis

Selecting the model with 4 topics

the_lda = ldas[[2]] ## pick the k = 4 model
topics_words <- tidy(the_lda, matrix = "beta") # again, pass matrix = "beta" to tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term     phi
##    <int> <chr>  <dbl>
##  1     1 捷運  0.0370
##  2     3 捷運  0.0353
##  3     2 捷運  0.0350
##  4     2 營運  0.0219
##  5     4 捷運  0.0198
##  6     2 台中  0.0188
##  7     2 綠線  0.0177
##  8     4 軸心  0.0171
##  9     4 中捷  0.0167
## 10     2 通車  0.0157

Sorting the terms in each topic by phi in descending order

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # set a CJK font so Chinese labels render correctly

Removing terms common to all topics

e.g. 捷運, 台中, 中捷, and so on

removed_word = c("捷運","台中","中捷","可以","表示","台中市","沒有","真的","交通","交通部")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # set a CJK font so Chinese labels render correctly

Naming the topics

topics_name = c("台中捷運通車","綠線11月試營運","藍線經費暴增爭議","台中捷運斷軌事件") # MRT opening / Green Line November trial run / Blue Line budget controversy / axle-fracture incident

Based on the terms above, the topics can be identified as follows:

  1. The first topic features terms such as 「通車」 and 「公車」, so the first panel is about the Taichung MRT opening for service (2021/04/25).
  2. The second features 「試營運」 and 「綠線」, so it covers the Green Line trial run; since the Green Line runs along Wenxin Road to the Taichung HSR station, 「高鐵」 also appears in this panel. (2020/11/16)
  3. The third features 「藍線」 and 「市府」; given the surge in the Blue Line's budget, related terms such as 「運量」 also appear, so this panel is about the Blue Line budget controversy. (2021/03)
  4. The fourth features 「軸心」, 「故障」, and 「斷裂」, so it is about the Taichung MRT axle-fracture incident. (2020/11/21)

Document-topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[MetaData$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
ptt_topic = cbind(MetaData,document_topics_df)

# drop the commentNum, push, and boo columns
ptt_topic$commentNum = NULL
ptt_topic$push = NULL
ptt_topic$boo = NULL
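
A quick sanity check (a hedged aside): each row of the posterior topic matrix is a probability distribution over the four topics, so every row should sum to 1.

summary(rowSums(document_topics_df))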

Inspecting the articles of a specific topic

Sorting articles by their weight on a given topic shows what the articles that load most heavily on that topic are discussing.

ptt_topic %>%
  arrange(desc(`台中捷運通車`)) %>%head(10) 

Tracking how topics change over time

ptt_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  #scale_fill_manual(values=c("#FFD449","#80AB82","#2F6690","#EF8354"))+
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme_grey(base_family = "STKaiti") + # set a CJK font so Chinese labels render correctly
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # apply after the complete theme, or the rotation is reset

Dropping months with few articles

ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter(!format(artDate,'%Y%m') %in% c(202005, 202006, 202007, 202008, 202009, 202010, 202101, 202102, 202105)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme_grey(base_family = "STKaiti") + # set a CJK font so Chinese labels render correctly
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # apply after the complete theme, or the rotation is reset

This chart lines up with the earlier plot of article counts: the months with the heaviest discussion are exactly November-December 2020, mid-March 2021, and mid-to-late April 2021.

From the topic distribution in the chart we can infer:

  • In November-December 2020, the discussion centered on the trial run opening (2020/11/16) and the axle-fracture incident (2020/11/21) that followed it.
  • In March 2021, the main topic was the controversy over the surge in the Blue Line's budget.
  • In April 2021, besides the ongoing Blue Line discussion, the Taichung MRT also drew attention for its official opening on 4/25.

Topic changes over time as proportions

ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter(!format(artDate,'%Y%m') %in% c(202005, 202006, 202007, 202008, 202009, 202010, 202101, 202102, 202105)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme_grey(base_family = "STKaiti") + # set a CJK font so Chinese labels render correctly
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # apply after the complete theme, or the rotation is reset

The proportion chart confirms the inferences above:

  • In 2020/11, the discussion topics were the trial run opening and the axle-fracture incident.
  • In 2020/12, discussion of the axle-fracture incident kept burning.
  • In 2021/03, the discussion topic was the Blue Line budget controversy.
  • In 2021/04, the official opening drove a high volume of discussion.

6. Training LDA with a Different Package

Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
tokens <- MetaData %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))| str_detect(word, regex("[Aa][Zz]")))

Building the DTM

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 478 432

The LDA model

set.seed(2019)

topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [12:31:22.506] early stopping at 160 iteration 
## INFO  [12:31:22.613] early stopping at 30 iteration

Compared with the results from the topicmodels package above:

  • With the same four topics, this package produces topics that overlap.
  • Topics 1 and 3 likely overlap because topic 1 contains terms such as 「求償」, which ties it to topic 3's discussion of the axle-fracture incident.
  • We also noticed some poorly segmented tokens, e.g. 「柯文哲」 is cut down to just 「柯文」 (one possible fix is sketched after this list).
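
A hedged sketch of one way to fix the segmentation issue: jiebaR's new_user_word() registers extra words with a running worker so that they are kept as single tokens.

new_user_word(jieba_tokenizer, "柯文哲")
segment("柯文哲談台中捷運", jieba_tokenizer)  # the full name should now survive intact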

Setting the number of topics to 3 instead gives a more reasonable split, with each topic's terms being more distinctive; a sketch of the refit follows.
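
The 3-topic refit itself is not shown here; a minimal sketch, reusing the priors and fitting settings of the 4-topic run above (the output below still comes from the 4-topic model):

set.seed(2019)
lda_model3 <- text2vec::LDA$new(n_topics = 3, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr3 <- lda_model3$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
lda_model3$get_top_words(n = 10, lambda = 0.5)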

The result can likewise be inspected with the LDAvis package:

lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 terms for each topic
##       [,1]     [,2]     [,3]   [,4]  
##  [1,] "軸心"   "通車"   "改善" "藍線"
##  [2,] "故障"   "市長"   "營運" "規劃"
##  [3,] "川崎"   "盧秀燕" "履勘" "路線"
##  [4,] "斷裂"   "完整"   "高鐵" "文心"
##  [5,] "重工"   "新聞"   "北屯" "公車"
##  [6,] "廠商"   "市民"   "旅客" "經費"
##  [7,] "北捷局" "記者"   "車站" "高架"
##  [8,] "安全"   "綠線"   "綠線" "延伸"
##  [9,] "檢測"   "柯文"   "人次" "地下"
## [10,] "列車"   "台北"   "委員" "路網"
lda_model$plot()
## Loading required namespace: servr
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)