## [1] ""
## [1] "Chinese (Traditional)_Taiwan.950"
packages = c("readr", "tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "purrr", "ramify", "RColorBrewer", "htmlwidgets", "servr", "igraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 載入packages
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# Article data
Detention <- fread("Detention.csv", encoding = "UTF-8")
Detention$artDate = Detention$artDate %>% as.Date("%Y/%m/%d") # convert the date column from chr to Date
# Comment (reply) data
Detention_review <- fread("Detention_articleReviews.csv", encoding = "UTF-8")
# Keep only the columns we need
Detention_review <- Detention_review %>%
  select(artUrl, cmtPoster, cmtStatus, cmtContent)
Detention_review
## artUrl cmtPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html franz10123
## 2: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html kawazakiz2
## 3: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html Tchachavsky
## 4: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html dcoog7880
## 5: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html AustinRivers
## ---
## 43771: https://www.ptt.cc/bbs/movie/M.1581865305.A.72B.html thinker123
## 43772: https://www.ptt.cc/bbs/movie/M.1585740803.A.313.html aehvtleo
## 43773: https://www.ptt.cc/bbs/movie/M.1585740803.A.313.html mysmalllamb
## 43774: https://www.ptt.cc/bbs/movie/M.1586164182.A.DDF.html engineer1
## 43775: https://www.ptt.cc/bbs/movie/M.1586164182.A.DDF.html Shawnny
## cmtStatus cmtContent
## 1: 推 :樓下噓撕裂族群
## 2: 推 :教官:馬英九飾演
## 3: 推 :抵制為什麼不找馬英九拉
## 4: 推 :推推
## 5: → :馬英九演教官我就10刷
## ---
## 43771: 噓 :哎啊
## 43772: → :難怪覺得眼熟...這兩年前的片了
## 43773: 推 :不錯呀,本來台灣沒興趣的片都出籠了,還有日暮!
## 43774: 推 :支持啊。人少就去看
## 43775: 推 :看都看
Detention %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "blue", size = 1) +
  theme_classic()
We can observe that the data are mainly concentrated after 2019/09, when the Detention film was released.
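As a quick check of this claim, we can count articles per month (a minimal sketch using only the columns already loaded):

# Articles per month; counts should jump from 2019/09 onward
Detention %>%
  mutate(month = format(artDate, "%Y/%m")) %>%
  count(month)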
length(unique(Detention$artPoster))
## [1] 470
length(unique(Detention_review$cmtPoster))
## [1] 12696
allPoster <- c(Detention$artPoster, Detention_review$cmtPoster)
length(unique(allPoster))
## [1] 12909
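The union (12,909) is smaller than 470 + 12,696 because some users both posted articles and left comments; the overlap can be checked directly:

# Users who appear both as article posters and as commenters
length(intersect(unique(Detention$artPoster), unique(Detention_review$cmtPoster)))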
# Initialize a segmentation (tokenizer) engine with default parameters
jieba_tokenizer = worker()
Detention_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x) > 1){
      tokens <- segment(x, jieba_tokenizer)
      # filter out stop words
      stop_words <- c("可以","一個","沒有","覺得","我們","因為","就是","什麼")
      tokens <- filter_segment(tokens, stop_words)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
    # strings of length <= 1 return NULL and are dropped when un-nesting
  })
}
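A quick way to sanity-check the tokenizer on a single string (the exact output depends on jiebaR's default dictionary):

# Tokenize one sample sentence; stop words and single-character tokens are removed
Detention_tokenizer("今天去看返校這部電影")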
tokens <- Detention %>%
  unnest_tokens(word, sentence, token = Detention_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% # drop tokens containing digits or Latin letters
  count(artUrl, word) %>%
  rename(count = n)
tokens %>% head(10)
## # A tibble: 10 x 3
## artUrl word count
## <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一日 1
## 2 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一段 1
## 3 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一線 1
## 4 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 不會 1
## 5 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 中學 1
## 6 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 之前 1
## 7 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 內容 2
## 8 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 公關 1
## 9 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 升旗 1
## 10 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 心理 1
Detention_dtm <- tokens %>% cast_dtm(artUrl, word, count)
Detention_dtm
## <<DocumentTermMatrix (documents: 612, terms: 19188)>>
## Non-/sparse entries: 87588/11655468
## Sparsity : 99%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(Detention_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一日 一段 一線 不會
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0 0 2
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 1 0 0
## Terms
## Docs 中學 之前 內容 公關
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 0 0 0
## Terms
## Docs 升旗 心理
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 0
Inspecting the DTM, we can see that it is a sparse matrix.
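As a sanity check, the sparsity reported above can be recomputed from the slots of tm's simple_triplet_matrix representation (a minimal sketch):

# Fraction of zero entries; should match the ~99% sparsity reported by tm
nonzero <- length(Detention_dtm$v)
1 - nonzero / (nrow(Detention_dtm) * ncol(Detention_dtm))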
lda <- LDA(Detention_dtm, k = 2, control = list(seed = 2020))
topics <- tidy(lda, matrix = "beta") #在tidy function裡使用 beta 來取出 Phi 矩陣。
topics
## # A tibble: 38,376 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 一日 0.0000224
## 2 2 一日 0.0000261
## 3 1 一段 0.0000578
## 4 2 一段 0.000739
## 5 1 一線 0.0000420
## 6 2 一線 0.0000800
## 7 1 不會 0.00144
## 8 2 不會 0.00199
## 9 1 中學 0.000810
## 10 2 中學 0.000675
## # ... with 38,366 more rows
The topics table gives, for each topic, the probability that the topic generates each word.
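Since each topic's row of Phi is a probability distribution over the vocabulary, the beta values within a topic should sum to 1; a quick sketch to confirm:

# Per-topic sums of beta; each should be (approximately) 1
topics %>%
  group_by(topic) %>%
  summarise(total_beta = sum(beta))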
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
From the figure above it is hard to tell the two topics apart, so we try fitting models with more topics.
ldas = list()
topics = c(2,3,10,25,36)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(Detention_dtm, k = topic, control = list(seed = 2020))
  ldas[[length(ldas) + 1]] <- lda            # collect the fitted models in a list
  print(paste(topic, "topic(s); elapsed time:", Sys.time() - start_time))
  save(ldas, file = "ldas_Detentionresult")  # checkpoint after each fit
}
The fitted models are saved to ldas_Detentionresult.
load("ldas_Detentionresult")
topics = c(2,3,10,25,36)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
Next we pick a model whose number of topics is manageable and whose perplexity is relatively low (lower perplexity means the model explains the corpus better).
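As a reference point, here is a sketch based on the standard definition of perplexity, exp(-log-likelihood / total token count); for a VEM fit evaluated without held-out data the two numbers below should roughly agree:

# Manual perplexity for the k = 2 model vs. topicmodels' own function
exp(-as.numeric(logLik(ldas[[1]])) / sum(Detention_dtm$v))
topicmodels::perplexity(ldas[[1]])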
topicmodels_json_ldavis <- function(fitted, doc_term){
  require(LDAvis)
  require(slam)
  # 2-D projection for LDAvis: Jensen-Shannon divergence between topics,
  # followed by classical multidimensional scaling
  ls_LDA = function (phi)
  {
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    mds.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = mds.fit[, 1], y = mds.fit[, 2])
  }
  # Find required quantities
  phi <- as.matrix(posterior(fitted)$terms)    # topic-word distributions
  theta <- as.matrix(posterior(fitted)$topics) # document-topic distributions
  vocab <- colnames(phi)
  doc_length <- slam::row_sums(doc_term)       # total tokens per document
  term_freq <- slam::col_sums(doc_term)        # total occurrences of each term
  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = as.vector(doc_length),
                                 term.frequency = term_freq, mds.method = ls_LDA)
  return(json_lda)
}
topic_10 = ldas[[3]] # the third element of ldas is the k = 10 model
json_res <- topicmodels_json_ldavis(topic_10, Detention_dtm)
serVis(json_res, out.dir = "vis", open.browser = T)
# re-encode the exported JSON as UTF-8 so the Chinese terms display correctly
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"), "./vis/lda.json")
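Indexing ldas by position is fragile; the k slot stored on every fitted topicmodels object lets us confirm the mapping explicitly (a small sketch):

# k of each fitted model, in fitting order; ldas[[3]] should report k = 10
sapply(ldas, function(m) m@k)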
Detention_lda = ldas[[3]] ## use the k = 10 model
topics <- tidy(Detention_lda, matrix = "beta") # again, matrix = "beta" extracts the Phi matrix
topics
## # A tibble: 192,380 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 大家 0.00344
## 2 2 大家 0.00214
## 3 3 大家 0.00239
## 4 4 大家 0.00163
## 5 5 大家 0.00109
## 6 6 大家 0.00184
## 7 7 大家 0.00273
## 8 8 大家 0.00111
## 9 9 大家 0.00317
## 10 10 大家 0.00302
## # ... with 192,370 more rows
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Every topic is dominated by the search keywords that were used to collect the articles in the first place, so it is hard to see how the groups differ. We therefore remove those words:
remove_word = c("返校","電影","自己","遊戲")
top_terms <- topics %>%
  filter(!term %in% remove_word) %>%
  group_by(topic) %>%
  top_n(8, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Now it is much clearer what each topic is mainly about.
topic_name = c("禁書",'劇情','兩岸歷史','政治、社會','電玩改編','None1','票房','金馬獎','None2','None3')
We can name seven of the ten topics: banned books (禁書), plot (劇情), cross-strait history (兩岸歷史), politics and society (政治、社會), video-game adaptation (電玩改編), box office (票房), and the Golden Horse Awards (金馬獎); the remaining three are left as None. A relabeled version of the plot follows.
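To make the facets self-describing, the plot above can be re-drawn with these names in place of the topic numbers (a sketch; topic_name is indexed by topic id):

# Re-draw the top-terms plot with human-readable topic names as facet titles
top_terms %>%
  mutate(topic_label = topic_name[topic],
         term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic_label, scales = "free") +
  coord_flip()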
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(Detention_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 613 10
Each article has a distribution over the topics, hence the 613 articles × 10 topics matrix.
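Each row of doc_pro is a probability distribution, so every article's topic proportions should sum to 1; a quick sketch to check:

# Every document's topic proportions should sum to (approximately) 1
summary(rowSums(doc_pro))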
# get document topic proportions
document_topics <- doc_pro[Detention$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(Detention, document_topics_df)
news_topic %>%
  arrange(desc(`禁書`)) %>% head(10)
The banned-books (禁書) topic indeed centers on historical book bans and the government's suppression of freedom of speech.
#news_topic[,c(11:20)] = sapply(news_topic[,c(11:20)] , as.numeric)
news_topic %>%
  dplyr::select(-commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
We remove the months with little data, as well as the None topics.
news_topic %>%
  filter(!format(artDate, '%Y%m') %in% c("201912","202002","202003","202004")) %>%
  dplyr::select(-None1, -None2, -None3, -commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows each month's volume, but the monthly proportions of each topic are not easy to read.
news_topic %>%
  filter(!format(artDate, '%Y%m') %in% c("201912","202002","202003","202004")) %>%
  dplyr::select(-None1, -None2, -None3, -commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))