library(data.table)
## Warning: package 'data.table' was built under R version 4.0.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.4
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.4
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 4.0.5
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(servr)
## Warning: package 'servr' was built under R version 4.0.5
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.0.5
library(purrr)
## Warning: package 'purrr' was built under R version 4.0.4
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
require(RColorBrewer)
## Loading required package: RColorBrewer
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
PTT boards: Gossiping (八卦版) and nCoV2019 (武漢肺炎版). Time span: 2020/1/1-2021/5/15. Keyword: 陳時中. Goal: find out what kinds of posts appear together with 陳時中 and how they can be categorized.
metadata <- fread("tttttt_articleMetaData.csv", encoding = "UTF-8")
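As a quick sanity check, we can glance at the columns this post relies on later (artDate, artUrl, commentNum, push, boo, sentence); this assumes the exported CSV contains those fields:
metadata %>%
  select(artDate, artUrl, commentNum, push, boo, sentence) %>%
  glimpse()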
metadata %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()
> Discussion of 陳時中 peaked on 2020-03-26, with another surge from 2020-08-19 to 2020-08-31. The lowest volume was in July 2020, a period when the outbreak was relatively mild.
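The peak dates can also be confirmed numerically from the same daily counts rather than read off the figure:
metadata %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(5)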
# Ch1. Document Term Matrix (DTM)
Remove the boilerplate phrases that appear when news articles are reposted to PTT.
metadata <- metadata %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|張貼問卦請注意|充實文章內容|是否有專板|本板並非萬能問板|一天只能張貼|自刪及被刪也算兩篇之內|超貼者將被水桶|本看板嚴格禁止政治問卦|發文問卦前請先仔細閱讀相關板規|未滿30繁體中文字水桶3個月|嚴重者以鬧板論", "", sentence))
Bigram tokenization
jieba_tokenizer = worker()
# bigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram <- ngrams(tokens, 2)   # NLP::ngrams (attached via tm)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }                               # one-character strings fall through and return NULL
  })
}
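A quick sanity check on a single made-up sentence (the exact segmentation depends on the dictionary jiebaR loads):
jieba_bigram("陳時中今天召開記者會說明疫情")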
metadata_bigram <- metadata %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
metadata_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
Trigram tokenization
jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram <- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
metadata_trigram <- metadata %>%
unnest_tokens(ngrams, sentence, token = jieba_trigram)
metadata_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
count(ngrams, sort = TRUE)
Remove stop words from the bigrams
# load stop words
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./stop_words.txt", what = character(), sep = "\n", :
## invalid input found on input connection './stop_words.txt'
metadata_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")
Remove stop words from the trigrams
metadata_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
Use a custom user dictionary together with a stop-word dictionary.
jieba_tokenizer = worker(user="word.txt",stop_word="stop_words.txt")
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
Count how many times each token appears in each article.
tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter((!str_detect(word, regex("[0-9a-zA-Z]")))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens %>% head(20)
dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 3546, terms: 41345)>>
## Non-/sparse entries: 340694/146268676
## Sparsity : 100%
## Maximal term length: 10
## Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 21/79
## Sparsity : 79%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一出 人口 人物 上午
## https://www.ptt.cc/bbs/Gossiping/M.1580201716.A.D4A.html 1 1 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1580218136.A.251.html 0 0 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1580228124.A.03A.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473375.A.390.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473872.A.066.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580621894.A.4DE.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580632630.A.857.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580637593.A.E42.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580638519.A.26D.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580641092.A.CD8.html 0 0 0 0
## Terms
## Docs 口罩 工作 不當 中央
## https://www.ptt.cc/bbs/Gossiping/M.1580201716.A.D4A.html 6 1 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1580218136.A.251.html 1 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1580228124.A.03A.html 4 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473375.A.390.html 1 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473872.A.066.html 0 0 0 3
## https://www.ptt.cc/bbs/Gossiping/M.1580621894.A.4DE.html 14 1 0 3
## https://www.ptt.cc/bbs/Gossiping/M.1580632630.A.857.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580637593.A.E42.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580638519.A.26D.html 0 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1580641092.A.CD8.html 0 0 0 0
## Terms
## Docs 介入 內部
## https://www.ptt.cc/bbs/Gossiping/M.1580201716.A.D4A.html 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1580218136.A.251.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580228124.A.03A.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473375.A.390.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580473872.A.066.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580621894.A.4DE.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580632630.A.857.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580637593.A.E42.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580638519.A.26D.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1580641092.A.CD8.html 0 0
lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # tune alpha and delta here
lda
## A LDA_Gibbs topic model with 2 topics.
topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi (topic-word) matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
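The document-topic side of the model can be pulled out the same way; a minimal sketch using matrix = "gamma" (theta, the per-document topic proportions):
doc_topics <- tidy(lda, matrix = "gamma")
colnames(doc_topics) <- c("document", "topic", "theta")
doc_topics %>% arrange(desc(theta)) %>% head(10)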
Sort the terms within each topic by phi in descending order and list the top 10.
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
mutate(top_words = reorder_within(term,phi,topic)) %>%
ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
Topic 1 appears to be related to the COVID-19 outbreak, and Topic 2 to the opening of US pork imports.
# Ch3. Finding the optimal number of topics
Fit models with 2, 3, 4, and 5 topics, store the results, and analyze them further. This step takes a while to run; the fitted models have already been saved to ldas_result.rdata and can be loaded directly.
# ldas = c()
# topics = c(2,3,4,5)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm, k = topic, control = list(seed = 2021,alpha = 2,delta=0.1),method = "Gibbs")
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas, file = "ldas_result.rdata") # save the fitted models to a file
# }
Load the saved LDA results for each number of topics.
load("ldas_result.rdata")
the_lda = ldas[[3]] ## pick the k = 4 model (ldas stores the k = 2, 3, 4, 5 fits in order)
topics_words <- tidy(the_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi (topic-word) matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
Sort the terms within each topic by phi in descending order.
topics_words %>%
group_by(topic) %>%
top_n(15, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
Remove words that are common to every topic.
removed_word = c("陳時中","目前","部長","市長","疫情","台北")
topics_words %>%
filter(!term %in% removed_word) %>%
group_by(topic) %>%
top_n(15, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
topics_name = c("檢疫狀況","陳時中個人新聞","疫苗接種","美豬事件") # quarantine/testing, personal news about 陳時中, vaccination, US pork imports
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)
Now we can look at the topic distribution of each article!
news_topic %>%
arrange(desc(`檢疫狀況`)) %>%head(20)
news_topic %>%
arrange(desc(`陳時中個人新聞`)) %>%head(20)
news_topic %>%
arrange(desc(`疫苗接種`)) %>%head(20)
news_topic %>%
arrange(desc(`美豬事件`)) %>%head(20)
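Each article can also be labeled with its single strongest topic; a small sketch based on the topic proportions computed above (ties broken by column order):
document_topics_df %>%
  mutate(main_topic = topics_name[max.col(as.matrix(.), ties.method = "first")]) %>%
  count(main_topic, sort = TRUE)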
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate") %>%
filter(variable != c("commentNum")) %>%
filter(variable != c("push")) %>%
filter(variable != c("boo")) %>%
ggplot( aes(x=artDate, y=value, fill = variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,3,5,7)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
#filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
filter(variable != c("commentNum")) %>%
filter(variable != c("push")) %>%
filter(variable != c("boo")) %>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors[c(1,3,5,7)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# > Topic 檢疫狀況 (quarantine/testing):
# Holds a steady share throughout; only September 2020 dips, likely because the outbreak had eased and the US-pork issue dominated discussion.
#
# > Topic 陳時中個人新聞 (personal news about 陳時中):
# September 2020 and March 2021: took the stage at the Taipei Arena to sing
# May 2020: inspection trip to Kenting to review epidemic-prevention measures
# March 2020: his son designed the "wash your hands, recall Han" T-shirt
#
# > Topic 疫苗接種 (vaccination):
# Discussion rises after February 2021, around the purchase of AZ and BNT vaccines
#
# > Topic 美豬事件 (US pork imports):
# Discussion peaks in September-November 2020
Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation
library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
##
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
##
## perplexity
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
jieba_tokenizer = worker(user="word.txt",stop_word="stop_words.txt")
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
another_tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")) | str_detect(word, regex("[Aa][Zz]"))) # drop alphanumeric tokens but keep ones containing "AZ" (the vaccine)
dtf <- document_term_frequencies(another_tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 3546 2468
set.seed(2019)
topic_n = 6
lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO [18:37:03.552] early stopping at 250 iteration
## INFO [18:37:06.408] early stopping at 40 iteration
This runs far faster than the topicmodels package.
lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words per topic (lambda = 0.5 balances term frequency and topic exclusivity)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "口罩" "柯文哲" "衛福部" "疫苗" "台灣" "確診"
## [2,] "疫情" "市長" "健保" "美豬" "防疫" "採檢"
## [3,] "戴口罩" "陳時中" "陳時中" "進口" "陳時中" "個案"
## [4,] "旅遊" "民進黨" "紓困" "標示" "部長" "隔離"
## [5,] "肺炎" "蘇貞昌" "討論" "開放" "疫情" "醫院"
## [6,] "日本" "中央" "政策" "接種" "網友" "感染"
## [7,] "國家" "台北市" "爭議" "豬肉" "墾丁" "檢疫"
## [8,] "管制" "侯友宜" "溝通" "萊克多巴胺" "感謝" "陰性"
## [9,] "措施" "政治" "地方" "瘦肉精" "記者" "症狀"
## [10,] "民眾" "防疫" "保費" "施打" "合作" "檢驗"
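Since fit_transform already returned the per-document topic distribution (doc_topic_distr), we can also count how many articles each topic dominates; topic indices here correspond to the columns of the table above:
table(max.col(doc_topic_distr, ties.method = "first"))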
lda_model$plot()
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)