[1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=zh_HK.UTF-8;LC_PAPER=zh_TW.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=zh_TW.UTF-8;LC_IDENTIFICATION=C"
packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis","data", "webshot","purrr","ramify","RColorBrewer", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
require(reshape2)  # needed later for melt()
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

udn = read.csv("raw_data/udn.csv", stringsAsFactors = FALSE) %>% mutate(source = "udn")
ebc= read.csv("raw_data/ebc.csv",stringsAsFactors = FALSE)%>% mutate(source = "ebc")
chinatimes= read.csv("raw_data/chinatimes.csv",stringsAsFactors = FALSE)%>% mutate(source = "chinatimes")
news = rbind(udn, ebc, chinatimes)
news$artDate = as.Date(news$artDate)

news %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()

We can see that the data is mostly concentrated after late January.
# Initialize a tokenizer with the default parameters
jieba_tokenizer = worker()
news_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop tokens whose string length is 1
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}

tokens <- news %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
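The step that actually builds `news_dtm` (summarised just below and reused by every model that follows) does not appear in this excerpt; a minimal sketch, assuming the tidytext `cast_dtm()` route with the token counts computed above:
news_dtm <- tokens %>%
  cast_dtm(document = artUrl, term = word, value = count)  # documents x terms, values = counts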
tokens %>% head(20)

news_dtm
<<DocumentTermMatrix (documents: 22803, terms: 145608)>>
Non-/sparse entries: 3192554/3317106670
Sparsity : 100%
Maximal term length: 16
Weighting : term frequency (tf)
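The call that produced the 10 x 10 sample below is likewise missing; it looks like tm's `inspect()` on a small slice of the DTM:
inspect(news_dtm[1:10, 1:10])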
<<DocumentTermMatrix (documents: 10, terms: 10)>>
Non-/sparse entries: 25/75
Sparsity : 75%
Maximal term length: 4
Weighting : term frequency (tf)
Sample :
Terms
Docs 癌症 癌症病人 安慰 半夜 辦法 爆發 北部 比較 表示 病床
https://news.ebc.net.tw/news/article/189213 2 1 1 1 1 1 1 1 1 5
https://news.ebc.net.tw/news/article/192073 0 0 0 0 0 2 0 0 1 0
https://news.ebc.net.tw/news/article/192142 0 0 0 0 0 1 0 0 3 0
https://news.ebc.net.tw/news/article/192157 0 0 0 0 0 0 0 0 1 0
https://news.ebc.net.tw/news/article/192475 0 0 0 0 0 1 0 0 2 0
https://news.ebc.net.tw/news/article/192528 0 0 0 0 0 0 0 0 0 0
https://news.ebc.net.tw/news/article/192934 1 0 0 0 0 0 1 0 4 0
https://news.ebc.net.tw/news/article/193023 0 0 0 0 0 1 0 0 1 0
https://news.ebc.net.tw/news/article/193107 0 0 0 0 0 0 0 0 1 0
https://news.ebc.net.tw/news/article/193183 0 0 0 0 0 0 0 3 3 0
Inspecting the DTM, we can see that it is a sparse matrix.
From `topics` we can get the probability that a given topic generates a given word.
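The first model fit and the `tidy()` call that create `topics` are missing from this excerpt; based on the note below about two topics, a sketch (the object name and seed are assumptions):
news_lda2 <- LDA(news_dtm, k = 2, control = list(seed = 2020))  # hypothetical 2-topic fit
topics <- tidy(news_lda2, matrix = "beta")                      # phi: P(word | topic)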
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

From the chart above the two topics look about the same; we can't tell them apart, so let's try a larger number of topics.
# ldas = c()
# topics = c(2,3,10,25,36)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(news_dtm, k = topic, control = list(seed = 2020))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result")
# }
This takes N hours to run; the fitted models have been saved to `ldas_result`.
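Since `ldas` is used below, the saved object presumably has to be loaded first:
load("ldas_result")  # restores the `ldas` list saved by the loop above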
topics = c(2,3,10,25,36)
tibble(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")`data_frame()` is deprecated as of tibble 1.1.0.
Please use `tibble()` instead.
[90mThis warning is displayed once every 8 hours.[39m
[90mCall `lifecycle::last_warnings()` to see where this warning was generated.[39m
Lower perplexity is better, but if it is too low the topics become too fine-grained. We usually pick a reasonable number of topics that also has a relatively low perplexity.
Another option is the ldatuning package, which computes several metrics: minimize CaoJuan2009 and Arun2010; maximize Deveaud2014 and Griffiths2004.
if(!('ldatuning' %in% existing)){ install.packages("ldatuning") }
library("ldatuning")
result <- FindTopicsNumber(
news_dtm,
topics = topics,
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 2020),
mc.cores = 2L,
verbose = TRUE
)
FindTopicsNumber_plot(result)

This also takes N hours to run; refer to the link above to learn how to use it.
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
### The following function fixes the NA problem that appears when the number of topics is large
## see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi)
{
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = slam::row_sums(doc_term),  # total token count per document
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}

# Set the alpha and delta parameters
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
####### The following generates the LDAvis files, which can later be opened locally or hosted online ##########
# for(lda in ldas){
#
# k = lda@k ## number of topics of this lda model
# if(k==2){next}
# json_res <- topicmodels_json_ldavis(lda,news_dtm)
# # serVis(json_res,open.browser = T)
# lda_dir = paste0(k,"_ldavis")
# if(!dir.exists(lda_dir)){ dir.create(lda_dir) }
#
# serVis(json_res, out.dir =lda_dir, open.browser = F)
#
# writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"), paste0(lda_dir,"/lda.json"))
# }
topic_10 = ldas[[3]] # the 10-topic model (third element of topics = c(2,3,10,25,36))
json_res <- topicmodels_json_ldavis(topic_10,news_dtm)
serVis(json_res,open.browser = T)
# If the browser window fails to open (Windows users), run this instead
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))news_lda = ldas[[3]] ## 選定topic 為10 的結果
topics <- tidy(news_lda, matrix = "beta") # note: use "beta" in tidy() to extract the phi matrix
topics
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()

Every topic is dominated by the search keywords used to collect the data, so the differences between the groups are not visible.
remove_word = c("疫情","武漢","防疫","確診","防疫","肺炎","冠狀病毒","病毒","新冠")
top_terms <- topics %>%
filter(!term %in% remove_word)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()

Now we can see what each topic is mainly about!
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(news_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
[1] 22803    10

Every article has a topic distribution: 22,803 articles x 10 topics.
# get document topic proportions
document_topics <- doc_pro[news$artUrl,]
document_topics_df =data.frame(document_topics)
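# NOTE: `topic_name` is never defined in this excerpt. It should be a character
# vector of 10 manually chosen labels, one per topic and in LDA topic order;
# later code shows it must contain "None" and an international-relations label,
# e.g. (hypothetical): topic_name <- c("國際關係", ..., "None")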
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(news,document_topics_df)
# news_topic %>% head(10)

Now we can see the topic distribution of every article!
We can see that the international-relations topic mainly covers US-China relations, Taiwan and the WHO, and other international affairs.
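The inspection behind this observation is not shown; a sketch that pulls the articles most weighted toward that topic (assuming 國際關係 is one of the `topic_name` labels):
news_topic %>%
  arrange(desc(國際關係)) %>%      # articles most dominated by this topic
  head(10) %>%
  dplyr::select(artUrl, 國際關係)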
news_topic[,c(7:16)] =sapply(news_topic[,c(7:16)] , as.numeric)
news_topic %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

December and May have too little data, so we drop those two months, along with the None topic.
news_topic %>%
filter( !format(artDate,'%Y%m') %in% c(201912,202005))%>%
dplyr::select(-None)%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

This shows each month's volume per topic, but the monthly proportions are hard to read.
news_topic %>%
filter( !format(artDate,'%Y%m') %in% c(201912,202005))%>%
dplyr::select(-None)%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Now we can see each topic's share within every month!
library(text2vec)
library(udpipe)
tokens <- news %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
[1] 22803 13264
set.seed(2019)
topic_n = 10
lda_model =text2vec::LDA$new(n_topics = topic_n,doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)

This runs many times faster than the topicmodels package.
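The call producing the top-words matrix below is not shown; in text2vec this is the fitted model's `get_top_words()` method:
lda_model$get_top_words(n = 10, topic_number = 1:10, lambda = 1)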
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "警方" "業者" "醫院" "中國" "經濟" "防疫" "企業" "專題" "病毒" "自己"
[2,] "透過" "人潮" "確診" "美國" "市場" "口罩" "申請" "數據" "研究" "網友"
[3,] "活動" "遊客" "症狀" "大陸" "成長" "居家" "員工" "圖表" "可能" "口罩"
[4,] "訊息" "餐廳" "武漢" "疫情" "影響" "檢疫" "勞工" "確診" "疫苗" "大家"
[5,] "捐贈" "飯店" "患者" "武漢" "投資" "中央" "取消" "台灣" "認為" "因為"
[6,] "分局" "連假" "感染" "防控" "生產" "市府" "方案" "肺炎" "專家" "沒有"
[7,] "網路" "民眾" "病例" "國家" "營收" "學校" "補助" "疫情" "藥物" "現在"
[8,] "學習" "觀光" "肺炎" "北京" "指數" "人員" "銀行" "新冠" "一個" "他們"
[9,] "員警" "酒店" "出現" "川普" "衝擊" "消毒" "貸款" "案例" "新冠" "我們"
[10,] "平台" "消費者" "通報" "冠狀病毒" "億元" "市長" "計畫" "圖解" "可以" "真的"