# Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # For Ubuntu
Sys.setlocale("LC_CTYPE", "cht") # For Windows
## [1] "Chinese (Traditional)_Taiwan.950"
packages = c("readr", "tm", "data.table", "dplyr", "stringr", "tidyverse", "jiebaR", "udpipe", "tidytext", "ggplot2", "tidyr", "lubridate", "topicmodels", "LDAvis", "webshot", "purrr", "ramify", "RColorBrewer", "wordcloud2", "htmlwidgets", "servr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(pacman)
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(lubridate) # for year() and month() below
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
fish_hatePolitics %>%
mutate(year = year(artDate), month = month(artDate)) %>%
group_by(year,month) %>%
summarise(count = n())%>%
ggplot(aes(x=month,y=count))+
geom_bar(stat="identity")+
facet_grid(~year)
1. The data is concentrated in May–June and December of last year; this year's discussion volume has clearly dropped off.
2. Likely reasons for the peaks in volume:
(1) May–June 2019: Han Kuo-yu had just announced his presidential run, sparking topics such as "runaway mayor" and "get elected president, keep clocking in at Kaohsiung".
(2) December 2019: close to the mid-January presidential election, at the height of campaign events and the presidential debates.
Initialize the tokenizer and add stop words
jieba_tokenizer = worker(stop_word = "stop_words.txt")
Custom word segmentation
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")
Remove words whose string length is 1
fish_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
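A quick sanity check of the tokenizer (the example sentence is hypothetical, not drawn from the corpus):
fish_tokenizer("韓國瑜當選高雄市長") # returns a list containing the tokens longer than one character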
Filter out special characters
tokens <- fish_hatePolitics %>%
mutate(id = c(1:nrow(fish_hatePolitics))) %>%
unnest_tokens(word, sentence, token = fish_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
Unify the names
# use %in% instead of == when matching several aliases: == recycles the vector
# and silently skips rows (the source of the recycling warnings in the original knit)
tokens$word[tokens$word %in% c("韓國魚", "韓董", "韓導", "韓總")] = "韓國瑜"
tokens$word[which(tokens$word == "郭董")] = "郭台銘"
tokens$word[tokens$word %in% c("柯p", "柯P")] = "柯文哲"
Inspect the first 20 rows
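The chunk that prints this preview is not shown; presumably something like:
head(tokens, 20)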
## artTitle artDate artTime
## 1 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 2 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 3 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 4 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 5 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 6 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 7 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 8 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 9 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 10 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 11 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 12 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 13 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 14 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 15 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 16 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 17 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 18 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 19 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 20 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## artUrl artPoster
## 1 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 2 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 3 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 4 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 5 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 6 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 7 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 8 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 9 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 15 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html WER0930
## artCat commentNum push boo id word
## 1 HatePolitics 84 18 3 1 以為
## 2 HatePolitics 84 18 3 1 韓國瑜
## 3 HatePolitics 84 18 3 1 兵法
## 4 HatePolitics 84 18 3 1 高雄
## 5 HatePolitics 84 18 3 1 這局
## 6 HatePolitics 84 18 3 1 棋中
## 7 HatePolitics 84 18 3 1 力挽狂瀾
## 8 HatePolitics 84 18 3 1 我本
## 9 HatePolitics 84 18 3 1 認為
## 10 HatePolitics 84 18 3 1 要選
## 11 HatePolitics 84 18 3 1 總統
## 12 HatePolitics 84 18 3 1 意思
## 13 HatePolitics 84 18 3 1 最好
## 14 HatePolitics 84 18 3 1 時機
## 15 HatePolitics 84 18 3 1 屁股
## 16 HatePolitics 84 18 3 1 還沒
## 17 HatePolitics 84 18 3 1 坐熱
## 18 HatePolitics 84 18 3 1 豈不是
## 19 HatePolitics 84 18 3 1 朱立倫
## 20 HatePolitics 84 18 3 1 前車之鑑
Compute each word's total frequency within each single document
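The chunk that builds word_count did not survive the knit; a minimal sketch consistent with its later use (columns artDate, word, and count):
word_count <- tokens %>%
group_by(artDate, word) %>%
summarise(count = n()) %>%
ungroup()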
Compute each word's total frequency across the whole corpus
tokens_count <- word_count %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
tokens %>%
count(word, sort = TRUE) %>%
top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
ylab("出現次數") +
coord_flip()
## Selecting by n
Word-frequency word clouds
Word cloud before the presidential election
word_count %>%
filter(artDate == as.Date('2020-01-11')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>20) %>% # 過濾出現太少次的字
wordcloud2()
Word cloud after the presidential election
word_count %>%
filter(artDate == as.Date('2020-01-13')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>20) %>% # 過濾出現太少次的字
wordcloud2()
After the January 11 presidential election, the word 罷免 (recall) became a hot term
word_count %>%
filter(word == "罷免" | word =="下台" | word == "罷韓" | word == "光復高雄") %>%
ggplot(aes(x=artDate,y=count))+
geom_bar(stat = "identity")+
scale_x_date(date_breaks = "1 month", date_labels = "%m/%d")
The ten days on which "recall" peaked
word_count %>%
filter(word == "罷免" | word =="下台" | word == "罷韓" | word == "光復高雄") %>%
group_by(artDate = artDate) %>%
summarise(count=sum(count)) %>%
arrange(desc(count)) %>%
slice(1:10)
## # A tibble: 10 x 2
## artDate count
## <date> <int>
## 1 2020-04-08 110
## 2 2020-01-13 91
## 3 2019-12-22 89
## 4 2020-01-11 89
## 5 2019-12-21 84
## 6 2020-01-30 74
## 7 2020-01-14 72
## 8 2019-09-13 70
## 9 2019-12-11 69
## 10 2019-06-27 65
April 2020: Han Kuo-yu petitioned the court to halt the recall vote, proposed universal COVID-19 screening, and proposed fining navy personnel who refused contact tracing
January 2020: presidential election
December 2019: presidential debates, Han Kuo-yu's appearance on The Night Night Show, the WeCare march
September 2019: Han Kuo-yu disowned the "oil drilling" remark; 郭台銘 (Terry Gou) quit the KMT
June 2019: Han Kuo-yu's leave-of-absence itinerary was made public; students confronted him face to face
word_count %>%
filter(word == "草包") %>%
ggplot(aes(x=artDate,y=count))+
geom_bar(stat = "identity")+
scale_x_date(date_breaks = "1 month", date_labels = "%m/%d")
The ten days on which "草包" (airhead) peaked
word_count %>%
filter(word == "草包") %>%
group_by(artDate) %>%
summarise(count=sum(count)) %>%
arrange(desc(count)) %>%
slice(1:10)
## # A tibble: 10 x 2
## artDate count
## <date> <int>
## 1 2019-05-13 35
## 2 2019-12-01 33
## 3 2019-05-03 31
## 4 2019-05-06 31
## 5 2019-05-04 28
## 6 2019-05-17 27
## 7 2019-05-07 26
## 8 2020-01-11 22
## 9 2019-08-29 21
## 10 2019-12-03 21
May 2019: the "airhead song" (草包之歌) was circulating
December 2019: presidential debates, The Night Night Show appearance, the WeCare march
January 2020: presidential election
August 2019: the "jingjing-ti" (晶晶體, Chinese–English code-mixing) controversy
LIWC stands for Linguistic Inquiry and Word Count, published by the psychologist Pennebaker in 2001.
# positive-word dictionary (txt file)
# words are separated by commas
P <- read_file("positive.txt")
# negative-word dictionary (txt file)
N <- read_file("negative.txt")
# the dictionary file is read in as one long string
typeof(P)
## [1] "character"
# split the string on commas
# strsplit returns a list; take the first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive",stringsAsFactors = F)
N = data.frame(word = N, sentiment = "negative",stringsAsFactors = F)
LIWC = rbind(P, N)
For each word in the corpus that appears in the LIWC dictionary: is it positive or negative?
tokens_count %>%
inner_join(LIWC) %>%
arrange(desc(count)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(sentiment) %>%
top_n(n=10,wt=count) %>%
ungroup() %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = TRUE) +
labs(x = NULL, y = "詞頻") +
facet_wrap(~sentiment, ncol = 2, scales = "free") +
coord_flip() # flip the axes
## Joining, by = "word"
Count each day's totals of positive words and negative words
sentiment_count <- word_count %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
Sentiment swings over the past year
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(date_labels = "%m/%d")
Sentiment swings after the January 11 presidential election
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(date_labels = "%m/%d",limits = as.Date(c("2020-01-01","2020-03-01")))+
geom_vline(xintercept = as.numeric(as.Date("2020-01-11")), colour = "black", linetype = "dashed", alpha = .3)
## Warning: Removed 572 row(s) containing missing values (geom_path).
tokens_dtm <- tokens %>%
count(artUrl, word) %>%
rename(count=n)
fish_dtm <- tokens_dtm %>%
cast_dtm(artUrl, word, count)
fish_dtm
## <<DocumentTermMatrix (documents: 23735, terms: 110456)>>
## Non-/sparse entries: 1317797/2620355363
## Sparsity : 100%
## Maximal term length: 14
## Weighting : term frequency (tf)
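The 10 × 10 sample printed next was presumably produced by inspecting a small slice of the matrix:
inspect(fish_dtm[1:10, 1:10])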
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 九二共識 人物
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0
## Terms
## Docs 力量 土包子 不好
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 2 3 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0 0
## Terms
## Docs 中心 內容 分享
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 2 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 1 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 1 0
## Terms
## Docs 午宴 心得
## https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html 1 1
## https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html 0 0
## https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html 0 0
Inspecting the DTM, we can see it is an extremely sparse matrix.
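The chunk fitting the initial two-topic model is not shown; a sketch consistent with the 220,912-row output below (2 topics × 110,456 terms; the seed mirrors the later chunks and is an assumption):
lda <- LDA(fish_dtm, k = 2, control = list(seed = 1234)) # assumed seed, matching the later chunks
topics <- tidy(lda, matrix = "beta") # beta: per-topic word-generation probabilities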
## # A tibble: 220,912 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 九二共識 0.000114
## 2 2 九二共識 0.000170
## 3 1 人物 0.000654
## 4 2 人物 0.000865
## 5 1 力量 0.000645
## 6 2 力量 0.000120
## 7 1 土包子 0.0000430
## 8 2 土包子 0.0000439
## 9 1 不好 0.000474
## 10 2 不好 0.000774
## # ... with 220,902 more rows
From topics we obtain the probability that each topic generates each word.
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
scale_x_reordered()
topics %>%
mutate(topic = paste0("topic", topic)) %>% # prefix the topic number so spread() can turn topics into columns
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1)) %>%
filter(abs(log_ratio) > 3.95) %>%
mutate(term = reorder(term, log_ratio)) %>% # order terms by their log ratio
ggplot(aes(log_ratio, term)) +
geom_col(show.legend = FALSE)
1. In the plot above, the left side leans toward topic 1 and the right side toward topic 2.
2. From these two plots the topics look nearly the same, with no obvious differences, so we try splitting into more topics.
# ldas = c()
# topics = c(2,5,10,15,25)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(fish_dtm, k = topic, control = list(seed = 1234))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result")
# }
Because this takes a long time to run, the fitted models were saved to ldas_result.
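The saved models presumably need to be reloaded before the next chunk (the file name comes from the save() call above):
load("ldas_result") # restores the ldas list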
topics = c(2,5,10,15,25)
tibble(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
The smaller the perplexity the better, but if it is too small the topics get split too finely. The usual practice is to pick a reasonable topic count with comparatively low perplexity, so for the subsequent analysis we split the corpus into 10 topics.
fish_lda = ldas[[3]] ## select the k = 10 model
topics <- tidy(fish_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
topics
## # A tibble: 1,104,560 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 九二共識 1.18e-24
## 2 2 九二共識 1.31e-15
## 3 3 九二共識 3.86e-14
## 4 4 九二共識 2.93e-30
## 5 5 九二共識 4.98e-24
## 6 6 九二共識 2.04e-21
## 7 7 九二共識 6.24e-19
## 8 8 九二共識 1.37e-16
## 9 9 九二共識 5.17e-29
## 10 10 九二共識 1.29e- 3
## # ... with 1,104,550 more rows
Each row is one word within one topic
# take the 10 words with the highest generation probability (highest beta) in each topic
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# draw bar charts
top_terms %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
scale_x_reordered()
Every topic is dominated by the search keywords used to collect the corpus, so we cannot tell the clusters apart.
remove_word = c("韓國瑜","高雄","總統","市長","韓粉","國民黨","韓導","韓總","一定","完全","比較","還要","表示","新聞","記者","去年","問題")
top_terms <- topics %>%
filter(!term %in% remove_word)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
scale_x_reordered()
Now we can see what each topic is mainly discussing!
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(fish_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 23735 10
Every article has its own topic distribution, giving 23,735 articles × 10 topics in total.
Inspect each article's topic composition
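A quick, sketched way to peek at a few documents' compositions:
round(head(doc_pro), 3) # each row is a document; its ten topic proportions sum to 1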
We can see that the topic "民調:韓 V.S. 蔡、郭、柯" mainly covers the shifting polls among Han, Tsai, Gou, and Ko.
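The chunk that constructs fish_topic is also missing from the knit; a minimal, hypothetical reconstruction that joins the per-document topic proportions back onto the article metadata via artUrl, so that columns 11–20 hold the ten topic proportions (the descriptive topic labels, such as the poll topic above, would have been assigned as column names afterwards):
doc_topics <- as.data.frame(doc_pro) %>%
tibble::rownames_to_column("artUrl") # document names in the DTM are the article URLs
fish_topic <- fish_hatePolitics %>%
distinct(artUrl, .keep_all = TRUE) %>%
inner_join(doc_topics, by = "artUrl")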
fish_topic[,c(11:20)] =sapply(fish_topic[,c(11:20)] , as.numeric)
fish_topic %>%
group_by(artDate = format(artDate,"%Y%m")) %>%
summarise_if(is.double, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Because there is too little data for February–April of this year, we drop those three months.
fish_topic %>%
group_by(artDate = format(artDate,"%Y%m")) %>%
filter(artDate < 202002) %>%
summarise_if(is.double, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows each month's volume, but the per-month topic proportions are hard to read.
fish_topic %>%
group_by(artDate = format(artDate,"%Y%m")) %>%
filter(artDate < 202002) %>%
summarise_if(is.double, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Now we can see each topic's share within every month!