library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(topicmodels)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
require(RColorBrewer)
## Loading required package: RColorBrewer
# Combine the data from the three boards
MetaData1 = fread('yen_mrt1_articleMetaData.csv', encoding = 'UTF-8')
MetaData2 = fread('yen_mrt2_articleMetaData.csv', encoding = 'UTF-8')
MetaData3 = fread('yen_mrt3_articleMetaData.csv', encoding = 'UTF-8')
Reviews1 = fread('yen_mrt1_articleReviews.csv', encoding = 'UTF-8')
Reviews2 = fread('yen_mrt2_articleReviews.csv', encoding = 'UTF-8')
Reviews3 = fread('yen_mrt3_articleReviews.csv', encoding = 'UTF-8')
MetaData = rbind(MetaData1, MetaData2, MetaData3)
Reviews = rbind(Reviews1, Reviews2, Reviews3)
# Filter the articles once more with the keywords (467 articles remain)
keywords = c('台中捷運', '中捷')
toMatch = paste(keywords, collapse = "|")
MetaData = with(MetaData, MetaData[grepl(toMatch, sentence) | grepl(toMatch, artTitle),])
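As a quick sanity check (an added line, not part of the original run), the remaining article count can be confirmed directly:
# confirm the number of articles left after the keyword filter (the comment above says 467)
nrow(MetaData)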
# Pick the comments belonging to the filtered articles
Reviews = left_join(MetaData, Reviews[, c("artUrl", "cmtContent")], by = "artUrl")
MetaData %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point()
Three time points show a clear increase in discussion volume on the PTT boards:
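To identify those dates programmatically rather than by eye, a small sketch (an added helper, not in the original analysis):
# list the ten days with the most articles as a rough guide to the spikes
MetaData %>%
  mutate(artDate = as.Date(artDate)) %>%
  count(artDate, sort = TRUE) %>%
  head(10)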
Initialize a jieba word-segmentation engine (here with a user dictionary and a stop-word list):
jieba_tokenizer = worker(user = "../dict/user_dict.txt", stop_word = "dict/stop_words.txt")

ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x) > 1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens that are only one character long
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
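A quick way to sanity-check the tokenizer (assuming the dictionary files above exist; the sample sentence here is made up):
# one-character tokens should be dropped from the output
ptt_tokenizer("台中捷運綠線正式通車")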
# Tokenize both the articles and the comments
#MToken <- MetaData %>% unnest_tokens(word, sentence, token=ptt_tokenizer)
#RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=ptt_tokenizer)
# Bind the two tokenized tables together
#data <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])
Count how many times each token appears in each article:
tokens <- MetaData %>%
  unnest_tokens(word, sentence, token = ptt_tokenizer) %>%
  # drop tokens containing digits or Latin letters, except those matching "az" (e.g. "AZ")
  filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
  count(artUrl, word) %>%
  rename(count = n)

tokens %>% head(20)
## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 報導 1
## 2: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 備註 2
## 3: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 變革 1
## 4: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 變化 1
## 5: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 不變 1
## 6: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 不依 1
## 7: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 部份 5
## 8: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 車站 1
## 9: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 出現 1
## 10: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 串連 2
## 11: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 此段 1
## 12: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 大雅 1
## 13: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 地下 10
## 14: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 東海大學 1
## 15: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 二任 1
## 16: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 發文 1
## 17: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 發展 1
## 18: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 方向 1
## 19: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 高架 2
## 20: https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 格式 1
dtm <- tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 480, terms: 11067)>>
## Non-/sparse entries: 49447/5262713
## Sparsity : 99%
## Maximal term length: 30
## Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 17/83
## Sparsity : 83%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 報導 備註 變革 變化
## https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 1 2 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html 1 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html 1 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html 0 0 0 0
## Terms
## Docs 不變 不依 部份 車站
## https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 1 1 5 1
## https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html 0 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html 0 0 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html 0 0 0 0
## Terms
## Docs 出現 串連
## https://www.ptt.cc/bbs/Gossiping/M.1588555751.A.3F0.html 1 2
## https://www.ptt.cc/bbs/Gossiping/M.1588559009.A.670.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588562255.A.77E.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588569392.A.358.html 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1588582698.A.3E5.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588990735.A.4C6.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1588991252.A.EB0.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1589116258.A.627.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590129143.A.C2C.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1590133343.A.03A.html 0 0
lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # tune alpha and delta
lda
## A LDA_VEM topic model with 2 topics.
topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 22,134 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 1 報導 3.42e- 3
## 2 2 報導 1.98e- 3
## 3 1 備註 2.64e- 3
## 4 2 備註 8.33e- 4
## 5 1 變革 1.72e-59
## 6 2 變革 7.80e- 5
## 7 1 變化 1.40e- 4
## 8 2 變化 1.56e- 4
## 9 1 不變 2.81e- 5
## 10 2 不變 7.80e- 5
## # … with 22,124 more rows
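For reference, the counterpart document-topic (theta) matrix can be extracted the same way; a minimal sketch:
# extract the per-document topic probabilities with matrix = "gamma"
topics_docs <- tidy(lda, matrix = "gamma")
head(topics_docs)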
Sort each topic's terms by phi in descending order and list the top 10 representative words:
topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term, phi, topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # avoid garbled Chinese characters
Try 2, 4, 6, 10, and 15 topics, store the fitted models, and analyze them further.
(This step takes a while to run; the finished models have been saved to ldas_result.rdata and can be loaded directly.)
#ldas = c()
#topics = c(2,4,6,10,15)
#for(topic in topics){
#  start_time <- Sys.time()
#  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#  ldas = c(ldas, lda)
#  print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#  save(ldas, file = "ldas_result.rdata") # write the fitted models to a file
#}
Load the saved LDA results for each topic number:
load("ldas_result.rdata")
topics = c(2, 4, 6, 10, 15)

# tibble() replaces the deprecated data_frame()
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
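To read the best k off the curve programmatically, a small sketch using the same perplexity call as above:
# pick the topic number with the lowest perplexity among the fitted models
perplexities <- map_dbl(ldas, topicmodels::perplexity)
topics[which.min(perplexities)]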
Create the JSON that LDAvis needs.
This function converts a model fitted with the LDA() function above into the input format required by the LDAvis package.
topicmodels_json_ldavis <- function(fitted, doc_term){
  require(LDAvis)
  require(slam)

  ### The function below works around the NA problem that appears with larger topic numbers
  ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
  ls_LDA = function(phi){
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    pca.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
  }

  # Find required quantities
  phi <- as.matrix(posterior(fitted)$terms)
  theta <- as.matrix(posterior(fitted)$topics)
  vocab <- colnames(phi)
  term_freq <- slam::col_sums(doc_term)

  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = as.vector(table(doc_term$i)),
                                 term.frequency = term_freq, mds.method = ls_LDA)
  return(json_lda)
}
the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda, dtm)

serVis(json_res, open.browser = T)
serVis(json_res, out.dir = "vis", open.browser = T)
# rewrite the exported JSON as UTF-8 so the Chinese terms display correctly
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"),
           file("./vis/lda.json", encoding = "UTF-8"))
From the LDAvis results we can get a preliminary sense of what these four topics discuss:
the_lda = ldas[[2]] ## select the result with 4 topics
topics_words <- tidy(the_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 1 捷運 0.0370
## 2 3 捷運 0.0353
## 3 2 捷運 0.0350
## 4 2 營運 0.0219
## 5 4 捷運 0.0198
## 6 2 台中 0.0188
## 7 2 綠線 0.0177
## 8 4 軸心 0.0171
## 9 4 中捷 0.0167
## 10 2 通車 0.0157
topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term, phi, topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # avoid garbled Chinese characters
Words that appear across all topics carry little discriminative value, e.g. 捷運, 台中, 中捷, and so on, so we remove them:
= c("捷運","台中","中捷","可以","表示","台中市","沒有","真的","交通","交通部")
removed_word
topics_words %>%
  filter(!term %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term, phi, topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti") # avoid garbled Chinese characters
= c("台中捷運通車","綠線11月試營運","藍線經費暴增爭議","台中捷運斷軌事件") topics_name
透過上述字詞,可將其分為以下主題:
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[MetaData$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
ptt_topic = cbind(MetaData, document_topics_df)

# drop the commentNum, push, and boo columns
ptt_topic$commentNum = NULL
ptt_topic$push = NULL
ptt_topic$boo = NULL
ptt_topic
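As an optional extension (not part of the original analysis; main_topic is a hypothetical column name), each article can also be tagged with its single most probable topic:
# label every article with its highest-probability topic (ties broken arbitrarily)
ptt_topic$main_topic <- topics_name[max.col(as.matrix(document_topics_df))]
table(ptt_topic$main_topic)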
After sorting the articles by a given topic's weight, we can see what the articles that load heavily on that topic are discussing.
ptt_topic %>%
  arrange(desc(`台中捷運通車`)) %>% head(10)
ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  #scale_fill_manual(values=c("#FFD449","#80AB82","#2F6690","#EF8354")) +
  scale_fill_manual(values = c("#cacaca","#a9c6de","#5588a3","#145374")) +
  theme_grey(base_family = "STKaiti") + # avoid garbled Chinese characters
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # placed after theme_grey() so the rotation is not overridden
ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(!format(artDate, '%Y%m') %in% c(202005, 202006, 202007, 202008, 202009, 202010, 202101, 202102, 202105)) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = c("#cacaca","#a9c6de","#5588a3","#145374")) +
  theme_grey(base_family = "STKaiti") + # avoid garbled Chinese characters
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # placed after theme_grey() so the rotation is not overridden
This chart lines up with the earlier plot of article counts: the months with the most discussion are exactly November to December 2020, mid-March 2021, and mid-to-late April 2021.
From the topic distribution in the chart we can also infer:
ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(!format(artDate, '%Y%m') %in% c(202005, 202006, 202007, 202008, 202009, 202010, 202101, 202102, 202105)) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x = artDate, y = value/total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = c("#cacaca","#a9c6de","#5588a3","#145374")) +
  theme_grey(base_family = "STKaiti") + # avoid garbled Chinese characters
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # placed after theme_grey() so the rotation is not overridden
This proportion chart confirms the inferences above:
Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation
library(text2vec)
##
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
##
## perplexity
library(udpipe)
tokens <- MetaData %>%
  unnest_tokens(word, sentence, token = ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]")))

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 478 432
set.seed(2019)
topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO [12:31:22.506] early stopping at 160 iteration
## INFO [12:31:22.613] early stopping at 30 iteration
Comparing with the results from the topicmodels package above:
Setting the number of topics to 3 instead turns out to be more reasonable, and the words in each topic are more distinctive (a 3-topic refit sketch is given at the end of this section).
lda_model$get_top_words(n = 10, lambda = 0.5) ## view the top 10 words of each topic
## [,1] [,2] [,3] [,4]
## [1,] "軸心" "通車" "改善" "藍線"
## [2,] "故障" "市長" "營運" "規劃"
## [3,] "川崎" "盧秀燕" "履勘" "路線"
## [4,] "斷裂" "完整" "高鐵" "文心"
## [5,] "重工" "新聞" "北屯" "公車"
## [6,] "廠商" "市民" "旅客" "經費"
## [7,] "北捷局" "記者" "車站" "高架"
## [8,] "安全" "綠線" "綠線" "延伸"
## [9,] "檢測" "柯文" "人次" "地下"
## [10,] "列車" "台北" "委員" "路網"
lda_model$plot()
## Loading required namespace: servr
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)
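As noted above, a minimal refit sketch with 3 topics (same priors and iteration settings as the 4-topic fit; lda_model3 and doc_topic_distr3 are hypothetical names):
# refit the text2vec LDA with 3 topics for comparison
lda_model3 = text2vec::LDA$new(n_topics = 3, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr3 = lda_model3$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
lda_model3$get_top_words(n = 10, lambda = 0.5) # check whether the three topics separate more cleanly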