Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # for Ubuntu
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# Sys.setlocale("LC_CTYPE", "cht") # For windows.
packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot","purrr","ramify","RColorBrewer", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20) # extend the 8-colour Set3 palette to 20 colours
fish_hatePolitics <- fread(file = "rawdata.csv", header = TRUE, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE)
fish_hatePolitics %>%
  mutate(artDate = as.Date(artDate, "%Y/%m/%d"), # parse the date before extracting parts
         year = year(artDate), month = month(artDate)) %>% # year()/month() come from data.table
  group_by(year, month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = month, y = count)) +
  geom_bar(stat = "identity") +
  facet_grid(~year)
1. The posts are concentrated in May-June and December of last year (2019); discussion volume this year has dropped off noticeably.
2. Likely causes of the volume peaks:
(1) May-June 2019: Han Kuo-yu (韓國瑜) had just announced his presidential run, sparking topics such as "烙跑市長" ("runaway mayor") and "選上總統,高雄上班" ("win the presidency, work from Kaohsiung").
(2) December 2019: the run-up to the mid-January presidential election, the peak period for campaign rallies and the presidential debates.
# Initialize a jiebaR segmentation engine with a user dictionary and a stop-word list
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")
new_user_word(jieba_tokenizer, c("韓國瑜","韓國魚","不支持","柯文哲","九二共識","競選活動","罷韓遊行","扶不起的阿斗","高雄市市府","高雄市市長","造勢活動","蔡英文","時代力量")) # add custom words so these phrases survive segmentation intact
## [1] TRUE
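As a quick sanity check (the sentence below is a hypothetical example, not drawn from the data), we can confirm that the custom dictionary entries come back as single tokens:

# Custom entries such as "韓國瑜", "競選活動" and "九二共識" should each
# appear as one token in the output.
segment("韓國瑜在競選活動上談九二共識", jieba_tokenizer)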
fish_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens) > 1] # drop single-character tokens
      return(tokens)
    }
  })
}
tokens <- fish_hatePolitics %>%
  unnest_tokens(word, sentence, token = fish_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% # drop tokens containing digits or Latin letters
  count(artUrl, word) %>%
  rename(count = n)
# Normalize name variants (note: use %in%, not ==, when matching several aliases)
tokens$word[tokens$word %in% c("韓國魚","韓董","韓導")] = "韓國瑜"
tokens$word[tokens$word == "郭董"] = "郭台銘"
tokens$word[tokens$word %in% c("柯p","柯P")] = "柯文哲"
tokens$word[tokens$word == "小英"] = "蔡英文"
tokens %>% head(20)
fish_dtm <- tokens %>% cast_dtm(artUrl, word, count) # build the document-term matrix
fish_dtm
## <<DocumentTermMatrix (documents: 23735, terms: 110521)>>
## Non-/sparse entries: 1347152/2621868783
## Sparsity : 100%
## Maximal term length: 14
## Weighting : term frequency (tf)
inspect(fish_dtm[1:10, 1:10]) # inspect the first 10 documents and 10 terms
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 九二共識 人物
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0
## Terms
## Docs 力量 土包子 不好
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 2 3 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0 0
## Terms
## Docs 中心 內容 分享
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 2 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 1 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 1 0
## Terms
## Docs 午宴 心得
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0
Printing the DTM shows that it is an extremely sparse matrix: almost all entries are zero.
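To see where the rounded "100%" comes from, here is a minimal sketch that computes the sparsity directly: a DocumentTermMatrix is stored as a slam triplet matrix, so the number of non-zero entries is the length of its $v slot (note as.numeric, since 23735 * 110521 overflows R's integer range):

nnz <- length(fish_dtm$v)                            # number of non-zero entries
total <- as.numeric(nrow(fish_dtm)) * ncol(fish_dtm) # avoid integer overflow
1 - nnz / total # ~0.9995, which print() rounds up to "100%"

If a denser matrix were ever needed, tm::removeSparseTerms(fish_dtm, 0.99) would drop terms that are missing from more than 99% of the documents.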
fish_lda <- LDA(fish_dtm, k = 2, control = list(seed = 1234))
topics <- tidy(fish_lda, matrix = "beta") # use matrix = "beta" in tidy() to extract the Phi (per-topic term-probability) matrix
topics
From topics we get, for each topic, the probability (beta) that it generates each term.
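Since beta is P(term | topic), the betas within each topic should sum to 1; a quick check:

topics %>%
  group_by(topic) %>%
  summarise(total = sum(beta)) # each total should be (numerically) 1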
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
topics %>%
  mutate(topic = paste0("topic", topic)) %>% # name topics before spreading them into columns
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1)) %>%
  filter(log_ratio > 10 | log_ratio < -3) %>% # keep only the most topic-specific terms
  mutate(term = reorder(term, log_ratio)) %>% # order terms by log ratio
  ggplot(aes(log_ratio, term)) +
  geom_col(show.legend = FALSE)
1. In the plot above, terms with a negative log ratio (left side) are more characteristic of topic 1, and terms with a positive log ratio (right side) of topic 2.
2. Judging from these two plots, the two topics look nearly identical, with no clear difference, so we try fitting a larger number of topics.
# ldas = list()
# topics = c(2, 5, 8, 10, 15, 25)
# for (topic in topics) {
#   start_time <- Sys.time()
#   lda <- LDA(fish_dtm, k = topic, control = list(seed = 1234))
#   ldas <- c(ldas, lda)
#   print(paste(topic, "topic(s), elapsed:", Sys.time() - start_time))
#   save(ldas, file = "ldas_result") # checkpoint after every model
# }
Because fitting all of these models takes a long time, the results were saved to the file ldas_result and are loaded below.
load("ldas_result")
topics = c(2,5,8,10,15,25)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
Lower perplexity is better, but with too many topics the clusters become too fine-grained. The usual approach is to pick a manageable number of topics at which perplexity is already fairly low, so for the rest of the analysis we use 10 topics.
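One caveat: perplexity measured on the training DTM keeps falling as k grows, so a comparison on held-out documents is more principled. A sketch (the 20000-document split point is an arbitrary illustration, and refitting takes as long as the loop above):

train <- fish_dtm[1:20000, ]               # fit on the first 20000 documents
test <- fish_dtm[20001:nrow(fish_dtm), ]   # hold out the rest
lda10 <- LDA(train, k = 10, control = list(seed = 1234))
topicmodels::perplexity(lda10, newdata = test) # lower is better on unseen data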
fish_lda = ldas[[which(topics == 10)]] # select the k = 10 model (the 4th in the list)
topics <- tidy(fish_lda, matrix = "beta") # again, matrix = "beta" extracts the Phi matrix
topics
Each row gives one term within one topic.
# Take the 10 terms with the highest generation probability (beta) in each topic
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Bar chart of the top terms per topic
top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
Every topic is dominated by the search keywords used to collect the data in the first place, so the differences between topics are hard to see. We therefore remove those high-frequency terms before ranking.
remove_word = c("韓國瑜","高雄","韓粉","國民黨" ,"總統","已經" ,"市長","根本" ,"是不是","一直","新聞","報導","韓導","韓總")
top_terms <- topics %>%
  filter(!term %in% remove_word) %>% # drop the crawl keywords before ranking
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
Now we can see what each topic is mainly about!
topic_name = c('罷韓聯署','媒體資訊','高雄現況','黨內初選','競選行程','豪宅風雲','防疫議題','國際局勢','令人詬病的言論','直播狂熱粉')
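# English glosses for the topic labels above: recall-Han petition, media/news,
# state of Kaohsiung, party primary, campaign schedule, luxury-house controversy,
# epidemic prevention, international affairs, widely criticized remarks,
# livestream superfans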
# For every document we have a probability distribution over its topics
tmResult <- posterior(fish_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 23735 10
Each article has its own distribution over topics, giving a 23735 (articles) x 10 (topics) matrix.
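Each row of doc_pro is one document's topic distribution (gamma), so rows should sum to 1, and which.max gives each document's dominant topic; a quick sketch:

summary(rowSums(doc_pro))                # every row sums to 1
dominant <- apply(doc_pro, 1, which.max) # dominant topic per document
table(dominant)                          # how many documents each topic dominates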
Next we look at the topic composition of each article.
# Get each document's topic proportions, in the same row order as the articles
document_topics <- doc_pro[fish_hatePolitics$artUrl, ]
document_topics_df <- data.frame(document_topics)
colnames(document_topics_df) <- topic_name
rownames(document_topics_df) <- NULL
fish_topic <- cbind(fish_hatePolitics, document_topics_df)
fish_topic %>%
  arrange(desc(`直播狂熱粉`)) %>% # articles with the highest proportion of this topic
  head(10)
The top articles for the "直播狂熱粉" (livestream superfan) topic are dominated by Ms. Lin, a fervent Han Kuo-yu supporter who livestreams to speak up for Han on all sorts of issues.
fish_topic[, 11:20] <- sapply(fish_topic[, 11:20], as.numeric) # columns 11:20 are the appended topic proportions
fish_topic$artDate <- as.Date(fish_topic$artDate, "%Y/%m/%d")
fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Because there is very little data for February-April of this year (2020), we drop those three months.
fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  filter(artDate < 202002) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows each month's total volume, but the within-month proportions of the topics are not easy to read.
fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  filter(artDate < 202002) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Now we can see each month's topic proportions!
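As a side note, ggplot2's built-in position = "fill" produces the same stacked-proportion chart without computing total_value by hand (a sketch reusing the pipeline above):

fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  filter(artDate < 202002) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "fill") + # normalizes each bar to 1
  ylab("proportion") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))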