library(data.table)
library(ggplot2)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(topicmodels)
library(purrr)##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
require(RColorBrewer)## Loading required package: RColorBrewer
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)近兩日,臺灣新冠肺炎的本土個案激增,因此希望透過整理這兩天在PTT版上的文章能得到造成本土個案激增可能的原因或事件,並了解網友普遍對於確診個案激增的想法,在這次作業,我們透過中山管院文字分析平台,載入PTT Gossiping的文章,搜尋關鍵字為「本土、個案、肺炎」,時間從2021/5/14到2021/05/16。
metadata <- fread("covidtw_articleMetaData.csv", encoding = "UTF-8")從圖中可得知,因為在5/15號時臺灣突然有180例本土個案,因此討論量也激增,在之前可能討論的普遍都是與疫情相關的其他資訊(如新聞轉發、疫苗等)
metadata %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
labs(x = "日期", y = "文章數")+
geom_point()使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker(stop_word = "stopwords.txt")
news_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}計算每篇文章各token出現次數
tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens %>% head(20)## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 入住 1
## 2: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 大眾 1
## 3: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 已經 5
## 4: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 不進 1
## 5: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 中央 1
## 6: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 分已 1
## 7: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 升溫 1
## 8: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 出院 1
## 9: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 出現 2
## 10: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 只出 1
## 11: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 台北 4
## 12: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 台北市 2
## 13: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 台北市立 2
## 14: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 市民 3
## 15: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 市長 2
## 16: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 目前 3
## 17: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 休息 1
## 18: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 全面 1
## 19: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 共同 1
## 20: https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 共識 1
dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm## <<DocumentTermMatrix (documents: 309, terms: 6705)>>
## Non-/sparse entries: 20031/2051814
## Sparsity : 99%
## Maximal term length: 6
## Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 15/85
## Sparsity : 85%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 入住 大眾 已經 不進
## https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 1 1 5 1
## https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html 0 0 0 0
## Terms
## Docs 中央 分已 升溫 出院
## https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 1 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html 0 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html 2 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html 3 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html 0 0 0 0
## Terms
## Docs 出現 只出
## https://www.ptt.cc/bbs/Gossiping/M.1620921888.A.780.html 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1620922760.A.686.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620923292.A.6BC.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620926621.A.8D7.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927085.A.BED.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620927318.A.73B.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620930015.A.74C.html 3 0
## https://www.ptt.cc/bbs/Gossiping/M.1620949318.A.C46.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1620950513.A.178.html 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1620951121.A.1B5.html 0 0
lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs") #調整alpha即delta
lda## A LDA_VEM topic model with 2 topics.
topics_words <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words## # A tibble: 13,410 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 1 入住 1.09e-10
## 2 2 入住 1.30e- 4
## 3 1 大眾 3.64e- 4
## 4 2 大眾 1.54e- 4
## 5 1 已經 3.18e- 3
## 6 2 已經 2.56e- 3
## 7 1 不進 2.18e-10
## 8 2 不進 1.30e- 4
## 9 1 中央 3.11e- 4
## 10 2 中央 4.08e- 3
## # ... with 13,400 more rows
terms依照各主題的phi值由大到小排序,列出前10大
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
mutate(top_words = reorder_within(term,phi,topic)) %>%
ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()#ch3. 尋找最佳主題數
嘗試2、4、6、10、15個主題數,將結果存起來,再做進一步分析。 此部分需要跑一段時間,已經將跑完的檔案存成ldas_result.rdata,可以直接載入
# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm, k = topic, control = list(seed = 2021))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "covidtw_result.rdata") # 將模型輸出成檔案
# }載入每個主題的LDA結果
load("covidtw_result.rdata")topics = c(2,4,6,10,15)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
create LDAvis所需的json function 此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
###以下function 用來解決,主題數多會出現NA的問題
### 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi){
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = as.vector(table(doc_term$i)),
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))the_lda = ldas[[2]] ## 選定topic 為 4 的結果topics_words <- tidy(the_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)## # A tibble: 10 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 3 確診 0.0285
## 2 4 確診 0.0271
## 3 1 確診 0.0259
## 4 3 完整 0.0153
## 5 2 確診 0.0141
## 6 3 新聞 0.0136
## 7 2 個案 0.0133
## 8 3 疫情 0.0123
## 9 4 本土 0.0106
## 10 1 台灣 0.0106
terms依照各主題的phi值由大到小排序
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()去除部分詞彙:
removed_word = c("確診","本土","記者","備註","表示","所以","相關","疫情","記者","就是","現在","今天","什麼")
topics_words %>%
filter(!term %in% removed_word) %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()topics_name = c("相關事件","指揮中心","個案足跡","臺灣民情")# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)現在我們看每一篇的文章分佈了!
news_topic %>%
arrange(desc("個案足跡")) %>% head(10) variable.names((2))## NULL
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))參考 http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation
library(text2vec)##
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
##
## perplexity
library(udpipe)
tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))| str_detect(word, regex("[Aa][Zz]")))dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)## [1] 309 133
set.seed(2019)
topic_n = 4
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr =lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5,check_convergence_every_n = 100)## INFO [20:32:14.566] early stopping at 110 iteration
## INFO [20:32:14.680] early stopping at 50 iteration
這個比topicmodels的package跑快超多倍
lda_model$get_top_words(n = 10, lambda = 0.5) ## 查看 前10主題字## [,1] [,2] [,3] [,4]
## [1,] "肺炎" "確診" "隔離" "個案"
## [2,] "台大" "現在" "防疫" "疫情"
## [3,] "醫院" "知道" "確診" "完整"
## [4,] "感染" "今天" "消毒" "指揮中心"
## [5,] "萬華" "這樣" "進行" "本土"
## [6,] "國籍" "如果" "學生" "其中"
## [7,] "武漢" "就是" "通知" "新聞"
## [8,] "北部" "什麼" "來源" "新增"
## [9,] "應該" "是不是" "居家" "群聚"
## [10,] "開始" "有沒有" "台北" "網址"
lda_model$plot()## Loading required namespace: servr
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)