Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output (comment this line out on Windows)

With the recent COVID-19 outbreak in Taiwan, discussion of vaccines has stayed consistently high, and the many facets it touches on, such as politics and public sentiment, are what we want to explore. We therefore analyze the content in the following ways: sentiment analysis, word clouds, frequencies of positive and negative words, social network graphs, and more, examining how different times, opinion leaders, and news media shaped views on vaccines, and how the public felt, positively or negatively, about each vaccine.
Install the required packages and load the libraries
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr", "wordcloud2", "wordcloud", "data.table", "tm", "topicmodels", "purrr", "RColorBrewer")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
require(RColorBrewer)
library(igraph)
library(tidyr)
library(scales)
library(wordcloud2)
library(wordcloud)
library(reshape2)
library(widyr)
library(ggraph)

# vaccine <- read_csv("C:/Users/ooolivia/Desktop/NSYSU/109(2)/sma/final_prj/k/final_articleMetaData.csv") %>%
# mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
# mutate(sentence=gsub("\n", "", sentence)) %>%
# mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
#
# vaccine
#
# reviews <- read_csv("C:/Users/ooolivia/Desktop/NSYSU/109(2)/sma/final_prj/k/final_articleReviews.csv")
# reviews = reviews%>%
# mutate(cmtContent=gsub("[\n]{2,}", "。", cmtContent)) %>%
# mutate(cmtContent=gsub("\n", "", cmtContent)) %>%
# mutate(cmtContent=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", cmtContent))
#
# reviews
load("vaccine_preprocess.rdata")# vaccine <- vaccine %>%
# mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))nrow(vaccine)[1] 11588
nrow(reviews)[1] 860995
vaccine %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="blue")+
geom_point()
Discussion rises sharply from late May into early June.
5/26, 5/30, and 6/10 are the peak days. (A quick check follows.)
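As a sanity check, a minimal sketch (reusing the pipeline above) that lists the five busiest days:
vaccine %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  top_n(5, wt = count) %>% # the five days with the most articles
  arrange(desc(count))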
vaccine %>%
mutate(artDate = as.Date(artDate)) %>%
filter(artDate == "2021-05-26") %>%
head(15)
On 5/26, the discussion centers on domestic vaccine development and vaccine procurement.
vaccine %>%
mutate(artDate = as.Date(artDate)) %>%
filter(artDate == "2021-05-30") %>%
head(15)
On 5/30, the discussion centers on the Medigen (高端) vaccine and Terry Gou (郭台銘) buying vaccines.
vaccine %>%
mutate(artDate = as.Date(artDate)) %>%
filter(artDate == "2021-06-10") %>%
head(15)
On 6/10, the focus is the vaccine shortage.

Initialize the tokenizer
# # initialize a tokenizer engine
# # with a custom user dictionary and stop-word list
# jieba_tokenizer <- worker(user="vaccine.txt", stop_word = "stop_words.txt")
#
# chi_tokenizer <- function(t) {
# lapply(t, function(x) {
# if(nchar(x)>1){
# tokens <- segment(x, jieba_tokenizer)
# # drop tokens with string length 1
# tokens <- tokens[nchar(tokens)>1]
# return(tokens)
# }
# })
# }

Tokenization and organizing the results
# # tokenize and count each word's occurrences in each article
# vaccine_words <- vaccine %>%
# unnest_tokens(word, sentence, token=chi_tokenizer) %>%
# filter(!str_detect(word, regex("[0-9]"))) %>%
# count(artUrl, word, sort = TRUE)
# vaccine_words
#
# reviews_words <- reviews %>%
# unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
# filter(!str_detect(word, regex("[0-9]"))) %>%
# count(artUrl, word, sort = TRUE)
#
# # tokenization results for both articles and comments
# MToken <- vaccine %>% unnest_tokens(word, sentence, token=chi_tokenizer)
# RToken <- reviews %>% unnest_tokens(word, cmtContent, token=chi_tokenizer)
#
# # bind the data together
# data <- rbind(MToken[,c("artDate","artTitle","artUrl", "word")],RToken[,c("artDate","artTitle","artUrl", "word")])
#
# # format the date column
# data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
#
# # filter out special characters
# data_select = data %>%
#   filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
#   filter(!grepl("['^0-9']",word)) %>% # remove letters and digits
#   filter(nchar(.$word)>1)
#
# # daily frequency of each word
# # word_count: artDate, word, count
# word_count <- data_select %>%
#   select(artDate, word) %>%
#   group_by(artDate,word) %>%
#   summarise(count=n()) %>% # per-day, per-word totals
#   filter(count>3) %>% # drop words that appear too rarely
#   arrange(desc(count))
#
# save.image(file = "vaccine_preprocess.rdata")
# word_count

Prepare the sentiment dictionaries
P <- read_file("./dict/liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("./dict/liwc/negative.txt") # negative-word dictionary (txt)
# each dictionary file is read in as a single string
typeof(P)
[1] "character"
Split the strings into words and combine the two sentiment dictionaries.
# split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
# combine the two dictionaries
LIWC = rbind(P, N)
# inspect the dictionary
head(LIWC)

Compute the daily sentiment totals (sentiment_count)
# sentiment_count: artDate, sentiment, count
sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>%
  group_by(artDate,sentiment) %>%
  summarise(count=n())
Joining, by = "word"
`summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
sentiment_count

After normalizing the sentiment scores, we see that although positive and negative sentiment fluctuate, they are roughly even most of the time; from about 5/21 onward, negative sentiment takes the larger share.
sentiment_count %>%
  # normalization
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plotting
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-05-11','2021-06-11'))
  )+
  # add a line marking the date
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-05-30'))
  [1]])),colour = "yellow")

We pick out several dates with sentiment peaks. Looking at the daily sentiment scores, the topic starts being heavily discussed around 5/24, peaks on 6/4, and then gradually declines.
# rank the daily sentiment scores
sentiment_count %>%
  select(count,artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))

The public's negative views of the DPP appear in words such as 噁心 (disgusting) and 垃圾 (garbage); other threads concern the domestic Medigen vaccine not having completed phase-3 trials, and Terry Gou's vaccine purchase.
# draw the word cloud
word_count %>%
  filter(!(word %in% c("疫苗","真的","已經","現在","台灣"))) %>%
  filter(artDate == as.Date('2021-05-30')) %>%
  select(word,count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>200) %>% # drop words that appear too rarely
  wordcloud2()
Adding missing grouping variables: `artDate`
Compute the frequency of every word and identify representative sentiment words: positive: 支持 (support), 相信 (trust), 希望 (hope), 成功 (success); negative: 垃圾 (garbage), 問題 (problem), 噁心 (disgusting), 可憐 (pitiful).
sentiment_sum_select <-
  word_count %>%
  filter(artDate == as.Date('2021-05-30')) %>%
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>%
  arrange(desc(sum)) %>%
  data.frame()
Joining, by = "word"
`summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment 0530",
x = NULL) +
theme(text=element_text(size=14))+
theme(text = element_text(family = "Heiti TC Light"))+
  coord_flip()

The sentiment analysis so far sums over all words. Next we classify whole articles as positive or negative; it turns out that more articles are negative.
# classify each article by its balance of positive and negative words
article_type =
  data_select %>%
  inner_join(LIWC) %>%
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread sentiments into columns, fill missing with 0
  mutate(type = case_when(positive > negative ~ "positive",
                          TRUE ~ "negative")) %>%
  data.frame()
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# count how many articles fall in each class
article_type %>%
  group_by(type) %>%
  summarise(count = n())
article_type_date = left_join(article_type[,c("artUrl", "type")], vaccine[,c("artUrl", "artDate")], by = "artUrl")
article_type_date %>%
  group_by(artDate,type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "dodge")+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-05-10','2021-06-07'))
  )
`summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

After about 5/24, negative articles increase the most.
Pick out the positive and negative articles and join them with the tokenized words.
# negative_article: artUrl, word
negative_article <-
  article_type %>%
  filter(type=="negative")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# positive_article: artUrl, word
positive_article <-
  article_type %>%
  filter(type=="positive")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Convert the data to a Document-Term Matrix (DTM)
dtm <- vaccine_words %>% cast_dtm(artUrl, word, n)
dtm
<<DocumentTermMatrix (documents: 11588, terms: 63316)>>
Non-/sparse entries: 583543/733122265
Sparsity : 100%
Maximal term length: 59
Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])
<<DocumentTermMatrix (documents: 10, terms: 10)>>
Non-/sparse entries: 50/50
Sparsity : 50%
Maximal term length: 3
Weighting : term frequency (tf)
Sample :
Terms
Docs 復星 美國 台灣 細胞 疫苗 印度 造謠 中國 bnt the
https://www.ptt.cc/bbs/Gossiping/M.1621518888.A.D0E.html 0 16 0 6 86 0 0 22 0 0
https://www.ptt.cc/bbs/Gossiping/M.1621943637.A.3CE.html 0 40 0 0 95 6 0 79 0 0
https://www.ptt.cc/bbs/Gossiping/M.1622115922.A.734.html 42 4 27 0 61 0 0 16 26 0
https://www.ptt.cc/bbs/Gossiping/M.1622256534.A.708.html 0 1 46 0 59 0 0 31 1 0
https://www.ptt.cc/bbs/Gossiping/M.1622419101.A.CE5.html 1 10 55 0 64 0 0 29 0 0
https://www.ptt.cc/bbs/Gossiping/M.1622471096.A.28E.html 0 7 29 0 75 1 0 6 1 0
https://www.ptt.cc/bbs/Gossiping/M.1622476666.A.1DB.html 0 7 28 0 70 1 0 6 1 1
https://www.ptt.cc/bbs/Gossiping/M.1622721456.A.FBF.html 0 6 13 0 65 3 0 3 0 0
https://www.ptt.cc/bbs/Gossiping/M.1623170954.A.83B.html 0 20 27 0 84 1 0 1 3 0
https://www.ptt.cc/bbs/Gossiping/M.1623416895.A.409.html 0 0 0 64 1 0 0 0 0 0
Try 3, 4, 5, 6, and 7 topics, save the results, and analyze them further. (This step takes a while to run; the fitted models are saved in ldas_result_.rdata and can be loaded directly.)
# ldas = c()
# topics = c(3,4,5,6,7)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   ldas =c(ldas,lda)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas,file = "ldas_result_.rdata") # save the models to a file
# }

Load the previously fitted LDA results for each topic count.
load("ldas_result_.rdata")
topics = c(3,4,5,6,7)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
More topics yield lower perplexity and purer topic content.
We pick the elbow where the decline starts to flatten, as the quick check below illustrates.
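A minimal sketch (assuming ldas and topics as loaded above): the per-step drop in perplexity makes the elbow easy to read off numerically.
perplex <- map_dbl(ldas, topicmodels::perplexity)
# perplexity gained by each additional increase in topic count;
# the elbow is where this drop shrinks noticeably
data.frame(k = topics[-1], drop = -diff(perplex))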
Create the JSON needed by LDAvis. This function converts the model built earlier with the LDA function into the input format of the LDAvis package.
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
  ### the function below works around NAs that appear when the number of topics is large
  ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi){
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = as.vector(table(doc_term$i)),
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}

Render the LDAvis output (macOS)
# the_lda = ldas[[3]]
# json_res <- topicmodels_json_ldavis(the_lda,dtm)
# serVis(json_res,open.browser = T)
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))
We also used LDAvis to check the optimal number of topics, verifying that the 5 topics are well separated and that 5 is a sound choice.
We set lambda to 0.1, since smaller values favor more distinctive terms (a sketch of the relevance formula follows).
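For reference, a minimal sketch of the relevance score LDAvis ranks terms by, relevance = lambda * log(phi) + (1 - lambda) * log(phi / p(w)) (Sievert & Shirley, 2014); the names the_lda_k5, p_w, and relevance are ours for illustration:
the_lda_k5 <- ldas[[3]]                               # the k = 5 model
lambda <- 0.1
phi <- as.matrix(posterior(the_lda_k5)$terms)         # topic-term probabilities
p_w <- slam::col_sums(dtm) / sum(slam::col_sums(dtm)) # marginal term probabilities
p_w <- p_w[colnames(phi)]                             # align term order with phi
relevance <- lambda * log(phi) + (1 - lambda) * log(sweep(phi, 2, p_w, "/"))
head(sort(relevance[1, ], decreasing = TRUE), 10)     # ten most relevant terms for topic 1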
the_lda = ldas[[3]]
topics_words <- tidy(the_lda, matrix = "beta") # note: use "beta" in the tidy function to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
theme(text = element_text(family = "Heiti TC Light"))+
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
  scale_x_reordered()

removed_word = c("疫苗","台灣","my","疫情","sent","jptt","on","from","國家","國產","問題","中國","有沒有","不到","高端","國外","購買")
topics_words %>%
filter(!term %in% removed_word) %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
theme(text = element_text(family = "Heiti TC Light"))+
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
  scale_x_reordered()

# topics_name = c("各家疫苗比較","國產疫苗研發試驗","國產疫苗背後的政治","疫苗接種","疫苗採購")
topics_name = c("各國贈送疫苗","疫苗研發技術試驗","國產疫苗背後的政治","疫苗接種","疫苗採購")

Topic distribution per document
# for every document we have a probability distribution over its topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[vaccine$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
ptt_topic = cbind(vaccine,document_topics_df)
# drop the commentNum, push, and boo columns
ptt_topic$commentNum = NULL
ptt_topic$push = NULL
ptt_topic$boo = NULL

Sorting the articles by their weight on a given topic shows what the articles with a high share of that topic discuss, and the topic names can be adjusted based on the article content.
ptt_topic %>%
arrange(desc(`各國贈送疫苗`)) %>%head(10)
ptt_topic %>%
arrange(desc(`疫苗研發技術試驗`)) %>%head(10)
ptt_topic %>%
arrange(desc(`國產疫苗背後的政治`)) %>%head(10)
ptt_topic %>%
arrange(desc(`疫苗接種`)) %>%head(10)
ptt_topic %>%
  arrange(desc(`疫苗採購`)) %>%head(10)

# renamed topics 1 and 2
# topics_name = c("各國贈送疫苗","疫苗研發技術試驗","國產疫苗背後的政治","疫苗接種","疫苗採購")

# drop days with fewer than 300 records
ptt_topic %>%
mutate(artDate = as.Date(artDate)) %>%
filter( !format(artDate,'%Y%m%d') %in% c(20210511, 20210512, 20210513, 20210514, 20210515, 20210516, 20210517, 20210518, 20210519,20210520,20210521,20210522,20210523,20210524)) %>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  reshape2::melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
theme(text = element_text(family = "Heiti TC Light"))+
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374","red"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ptt_topic %>%
mutate(artDate = as.Date(artDate)) %>%
  filter( !format(artDate,'%Y%m%d') %in% c(20210511, 20210512, 20210513, 20210514, 20210515, 20210516, 20210517, 20210518, 20210519,20210520,20210521,20210522,20210523,20210524)) %>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  reshape2::melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
theme(text = element_text(family = "Heiti TC Light"))+
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374","red"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
The politics behind the domestic vaccine (國產疫苗背後的政治) is consistently a major focus of the articles.
removed_word = c("疫苗","台灣","my","疫情","sent","jptt","on","from","國家","國產","問題","中國","有沒有","不到","高端","國外","購買")
phi_m <- topics_words %>%
filter(!term %in% removed_word) %>%
arrange(desc(phi)) %>%
  top_n(70)
Selecting by phi
dtm_ <-phi_m %>% cast_dtm(topic, term, phi)
dtmm<-as.matrix(dtm_)
dim(dtmm)
[1] 5 54
network=graph_from_incidence_matrix(dtmm)
# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex=0.7,vertex.size=10,vertex.label.family = "Heiti TC Light")

Assign each article the topic with the largest gamma value.
# use the "gamma" argument in the tidy function to get the theta matrix
vaccine_topics <- tidy(the_lda, matrix="gamma") %>% # document-topic gamma
  group_by(document) %>%
  top_n(1, wt=gamma)
vaccine_topics

Merge the data (articles and comments)
# articles and comments
reviews <- reviews %>%
  select(artUrl, cmtPoster, cmtStatus, cmtContent)
posts_Reviews <- merge(x = vaccine, y = reviews, by = "artUrl")
# attach each article's topic
posts_Reviews <- merge(x = posts_Reviews, y = vaccine_topics, by.x = "artUrl", by.y="document")
# head(posts_Reviews,3)

Take the cmtPoster (replier), artPoster (poster), and artUrl (article link) columns.
link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)
head(link,3)

Filtering criteria: articles by date and comment count (commentNum); links and nodes by degree.
# check the distribution of comment counts (to guide the filtering below)
vaccine %>%
  filter(commentNum>100) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()

Articles with more than about 500 comments become rare, so 500 is a reasonable cutoff. (See the quick check below.)
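A minimal sanity check on that cutoff (assuming vaccine still carries commentNum at this point):
mean(vaccine$commentNum > 500) # share of articles above the cutoff
quantile(vaccine$commentNum, probs = c(0.50, 0.90, 0.99))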
Filter posts and reviews by each account's post or reply count.
# number of posts per account
post_count = vaccine %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
# total number of replies per account
review_count = reviews %>%
group_by(cmtPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
review_count
# posters
poster_select <- post_count %>% filter(count >= 10)
vaccine <- vaccine %>% filter(vaccine$artPoster %in% poster_select$artPoster)
# repliers
reviewer_select <- review_count %>% filter(count >= 300)
reviews <- reviews %>% filter(reviews$cmtPoster %in% reviewer_select$cmtPoster)

# check the number of participants
length(unique(posts_Reviews$artPoster)) # number of posters
[1] 4571
length(unique(posts_Reviews$cmtPoster)) # number of repliers
[1] 48293
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster) # all participants
length(unique(allPoster))
[1] 49391
Label every user who appears:
poster: has posted articles (and may also have commented)
replyer: has only commented
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%vaccine$artPoster, "poster", "replyer"))
head(userList,3)

link = posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
filter(commentNum > 500) %>%
filter(topic == 2 | topic == 3) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
link

filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user,3)

filter_degree = 5
# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "red", "blue")
# color edges by the topic of the article the reply belongs to
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "2", "palevioletred", "lightgreen")
# draw the network (only nodes with degree > 5 get labels)
set.seed(5432)
par(family=('Heiti TC Light'))
plot(reviewNetwork, vertex.size=5, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
# add legends
legend("bottomright", c("發文者","回文者"), pch=21,
       col="#777777", pt.bg=c("red","blue"), pt.cex=1, cex=1)
legend("topleft", c("疫苗研發技術試驗","國產疫苗背後的政治"),
       col=c("palevioletred", "lightgreen"), lty=1, cex=0.6)

From the graph we can identify:
The main poster for topic 2, 疫苗研發技術試驗 (vaccine R&D and trials): CavendishJr
The main poster for topic 3, 國產疫苗背後的政治 (the politics behind the domestic vaccine): COCOCCC
The main repliers on 疫苗研發技術試驗: JustinXD, wet
The main repliers on 國產疫苗背後的政治: yule1224, braveryhyde, weakerman, smallminhaha, AASoymilk
rasiel0919 replies to both topics. (The degree ranking below gives a numeric check.)
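A minimal numeric check on the leaders identified above: rank the nodes of reviewNetwork by degree.
# the ten best-connected users in the filtered network
head(sort(degree(reviewNetwork), decreasing = TRUE), 10)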
Vac_leader_data <- posts_Reviews %>%
filter((artPoster == "CavendishJr")|(artPoster == "COCOCCC"))
Vac_leader_data$artDate = as.Date(Vac_leader_data$artDate)
Vac_leader_data = Vac_leader_data %>% mutate(months = as.Date(cut(artDate, "months")))

Vac_CavendishJr <- subset(Vac_leader_data,artPoster == "CavendishJr")
Vac_COCOCCC <- subset(Vac_leader_data,artPoster == "COCOCCC")
VacRW_CavendishJr <- subset(Vac_leader_data,artPoster == "CavendishJr")
VacRW_COCOCCC <- subset(Vac_leader_data,artPoster == "COCOCCC")

Preprocessing for word correlations in CavendishJr's articles
Vac1_CavendishJr <- Vac_CavendishJr %>%
select(artUrl,sentence)
Vac1_CavendishJr <- strsplit(Vac1_CavendishJr$sentence,"[。!;?!?;]")
# pair each sentence with its article and arrange into a dataframe
Vac1_CavendishJr <- data.frame(
artUrl = rep(Vac_CavendishJr$artUrl, sapply(Vac1_CavendishJr, length)),
sentence = unlist(Vac1_CavendishJr))
Vac1_CavendishJr$sentence <- as.character(Vac1_CavendishJr$sentence)
jieba_tokenizer <- worker(user="vaccine.txt", stop_word = "stop_words.txt")
CavendishJr_word <- Vac1_CavendishJr %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word, sort = TRUE)

# compute pairwise correlations between words
CavendishJr_word_cors <- CavendishJr_word %>%
group_by(word) %>%
filter(n() >= 3) %>%
pairwise_cor(word, artUrl, sort = TRUE)
CavendishJr_word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

The main poster for topic 2, 疫苗研發技術試驗, is CavendishJr.
Data preprocessing for the sentiment analysis of the poster and repliers in CavendishJr's articles. First prepare CavendishJr's sentiment counts.
CavendishJr_LIWC <- CavendishJr_word %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
  summarise(count=sum(n))
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.

Organize the reviews for CavendishJr.
CavendishJr_LIWC <-CavendishJr_LIWC%>%
spread(sentiment, count, fill = 0)
cont_VacRW_CavendishJr <- VacRW_CavendishJr %>%
select(artUrl,cmtContent)
cont_VacRW_CavendishJr<-strsplit(cont_VacRW_CavendishJr$cmtContent,"[。!;?!?;]")
# pair each sentence with its article URL and arrange into a dataframe
cont_VacRW_CavendishJr <- data.frame(
artUrl = rep(VacRW_CavendishJr$artUrl, sapply(cont_VacRW_CavendishJr, length)),
cmtContent = unlist(cont_VacRW_CavendishJr)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
cont_VacRW_CavendishJr$cmtContent <- as.character(cont_VacRW_CavendishJr$cmtContent)
# tokenize and count each word's occurrences per article
cont_VacRW_CavendishJr_word <- cont_VacRW_CavendishJr %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
VacRW_CavendishJr_LIWC <- cont_VacRW_CavendishJr_word %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
  summarise(count=sum(n)) %>% spread(sentiment, count, fill = 0)
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
Combine the sentiment of CavendishJr's posts and the replies to them.
VacRW_CavendishJr_LIWC <- VacRW_CavendishJr_LIWC %>%
mutate(source="review")
CavendishJr_LIWC <- CavendishJr_LIWC %>%
mutate(source="article")
CavendishJr_cmt_sen <-
bind_rows(x = CavendishJr_LIWC, y = VacRW_CavendishJr_LIWC)
CavendishJr_cmt_sen
CavendishJr_sen_plot <- CavendishJr_cmt_sen %>%
gather(sentiment,article_number,-source,-artUrl) %>%
group_by(artUrl,source) %>%
mutate(total_article =sum(article_number),ratio=article_number/total_article) %>%
  group_by(artUrl) # number of posted/replied articles
CavendishJr_sen_plot %>%
ggplot( aes(x=as.factor(artUrl), y=ratio, fill=sentiment)) +
geom_bar(stat = "identity") + ylab("proportion") +
#theme(axis.text.x = element_text(angle = 90, hjust = 1))+
theme(text=element_text(family = "Heiti TC Light",size=12))+
  facet_wrap(~source, ncol = 1)

The x-axis is each article, with its positive and negative sentiment proportions, comparing the poster's sentiment against the repliers'. For positive articles, the sentiment trends of the poster and the repliers are generally not consistent. (A quick numeric check follows.)
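A minimal sketch quantifying that inconsistency: correlate the poster's and the repliers' positive ratios across CavendishJr's articles (a value near zero or below supports the reading above).
CavendishJr_sen_plot %>%
  filter(sentiment == "positive") %>%
  select(artUrl, source, ratio) %>%
  spread(source, ratio) %>% # columns: article, review
  with(cor(article, review, use = "complete.obs"))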
CavendishJr_word %>%
filter(!word %in% '疫苗') %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 3) %>%
arrange(desc(sum)) %>%
  wordcloud2()

Preprocessing for word correlations in COCOCCC's articles
Vac1_COCOCCC <- Vac_COCOCCC %>%
select(artUrl,sentence)
Vac1_COCOCCC <- strsplit(Vac1_COCOCCC$sentence,"[。!;?!?;]")
# pair each sentence with its article and arrange into a dataframe
Vac1_COCOCCC <- data.frame(
artUrl = rep(Vac_COCOCCC$artUrl, sapply(Vac1_COCOCCC, length)),
sentence = unlist(Vac1_COCOCCC))
Vac1_COCOCCC$sentence <- as.character(Vac1_COCOCCC$sentence)
jieba_tokenizer <- worker(user="vaccine.txt", stop_word = "stop_words.txt")
COCOCCC_word <- Vac1_COCOCCC %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  count(artUrl, word, sort = TRUE)

# compute pairwise correlations between words
COCOCCC_word_cors <- COCOCCC_word %>%
group_by(word) %>%
filter(n() >= 2) %>%
pairwise_cor(word, artUrl, sort = TRUE)
COCOCCC_word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

The main poster for topic 3, 國產疫苗背後的政治, is COCOCCC.
…the articles carry political overtones…
Data preprocessing for the sentiment analysis of the poster and repliers in COCOCCC's articles.
Prepare COCOCCC's sentiment counts.
COCOCCC_LIWC <- COCOCCC_word %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
  summarise(count=sum(n))
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.

Organize the reviews for COCOCCC.
COCOCCC_LIWC <-COCOCCC_LIWC%>%
spread(sentiment, count, fill = 0)
cont_VacRW_COCOCCC <- VacRW_COCOCCC %>%
select(artUrl,cmtContent)
cont_VacRW_COCOCCC<-strsplit(cont_VacRW_COCOCCC$cmtContent,"[。!;?!?;]")
# pair each sentence with its article URL and arrange into a dataframe
cont_VacRW_COCOCCC <- data.frame(
artUrl = rep(VacRW_COCOCCC$artUrl, sapply(cont_VacRW_COCOCCC, length)),
cmtContent = unlist(cont_VacRW_COCOCCC)) %>%
filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
cont_VacRW_COCOCCC$cmtContent <- as.character(cont_VacRW_COCOCCC$cmtContent)
# tokenize and count each word's occurrences per article
cont_VacRW_COCOCCC_word <- cont_VacRW_COCOCCC %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
VacRW_COCOCCC_LIWC <- cont_VacRW_COCOCCC_word %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
  summarise(count=sum(n)) %>% spread(sentiment, count, fill = 0)
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
Combine the sentiment of COCOCCC's posts and the replies to them.
VacRW_COCOCCC_LIWC <- VacRW_COCOCCC_LIWC %>%
mutate(source="review")
COCOCCC_LIWC <- COCOCCC_LIWC %>%
mutate(source="article")
COCOCCC_cmt_sen <-
bind_rows(x = COCOCCC_LIWC, y = VacRW_COCOCCC_LIWC)
COCOCCC_cmt_sen
COCOCCC_sen_plot <- COCOCCC_cmt_sen %>%
gather(sentiment,article_number,-source,-artUrl) %>%
group_by(artUrl,source) %>%
mutate(total_article =sum(article_number),ratio=article_number/total_article) %>%
  group_by(artUrl) # number of posted/replied articles
COCOCCC_sen_plot %>%
ggplot( aes(x=as.factor(artUrl), y=ratio, fill=sentiment)) +
geom_bar(stat = "identity") + ylab("proportion") +
#theme(axis.text.x = element_text(angle = 90, hjust = 1))+
theme(text=element_text(family = "Heiti TC Light",size=12))+
facet_wrap(~source, ncol = 1)
The poster's sentiment is mostly positive, while the repliers below lean negative. (A quick comparison follows.)
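A minimal sketch comparing the average positive ratio of COCOCCC's posts against that of the replies:
COCOCCC_sen_plot %>%
  ungroup() %>%
  filter(sentiment == "positive") %>%
  group_by(source) %>%
  summarise(mean_positive_ratio = mean(ratio))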
COCOCCC_word %>%
filter(!word %in% '疫苗') %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum > 1) %>%
arrange(desc(sum)) %>%
  wordcloud2()

Posters' positive and negative sentiment toward each vaccine
# vaccine_BNT =
# vaccine_words %>%
# filter(word == "BNT" | word == "輝瑞疫苗"| word == "bnt") %>%
# mutate(vaccine='BNT')
vaccine_national =
vaccine_words %>%
filter(word == "國產疫苗" | word == "高端疫苗"| word == "聯亞疫苗") %>%
inner_join(article_type) %>%
mutate(vaccine='national') %>%
group_by(vaccine,type)%>%
  summarise(count=n())
Joining, by = "artUrl"
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_BNT =
vaccine_words %>%
filter(word == "BNT" | word == "輝瑞疫苗"| word == "bnt") %>%
inner_join(article_type) %>%
mutate(vaccine='BNT') %>%
group_by(vaccine,type)%>%
  summarise(count=n())
Joining, by = "artUrl"
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_moderna =
vaccine_words %>%
filter(word == "莫德納" | word == "moderna") %>%
inner_join(article_type) %>%
mutate(vaccine='moderna') %>%
group_by(vaccine,type)%>%
  summarise(count=n())
Joining, by = "artUrl"
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_az =
vaccine_words %>%
filter(word == "AZ" | word == "az"| word == "Az"| word == "阿斯特捷利康") %>%
inner_join(article_type) %>%
mutate(vaccine='az') %>%
group_by(vaccine,type)%>%
  summarise(count=n())
Joining, by = "artUrl"
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_jj =
vaccine_words %>%
filter(word == "嬌生" | word == "Johnson & Johnson") %>%
inner_join(article_type) %>%
mutate(vaccine='jj') %>%
group_by(vaccine,type)%>%
  summarise(count=n())
Joining, by = "artUrl"
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
# combine the per-vaccine posts
vaccine_filter = rbind(vaccine_national,vaccine_BNT,vaccine_moderna,vaccine_az,vaccine_jj)

# vaccine_filter %>%
# ggplot( aes(x=vaccine, y=count, fill=type)) +
# geom_bar(stat = "identity") + ylab("count") +
# scale_fill_manual(values=c("#145374","red"))+
#   theme(axis.text.x = element_text(angle = 90, hjust = 1))

# vaccine_filter %>%
# mutate(total_count =sum(count))%>%
# ggplot( aes(x=vaccine, y=count/total_count, fill=type)) +
# geom_bar(stat = "identity") + ylab("proportion") +
# scale_fill_manual(values=c("red","#145374"))+
# theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
# geom_hline(aes(yintercept=0.5), colour="white",lwd=2)
When discussing the various vaccines, posters' sentiment is close to evenly split, suggesting they take a relatively objective stance on these events. (The sketch below computes the positive share per vaccine.)
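A minimal sketch computing the positive share of posts per vaccine, which makes the near-even split explicit:
vaccine_filter %>%
  group_by(vaccine) %>%
  mutate(share = count / sum(count)) %>% # share within each vaccine
  filter(type == "positive")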
Repliers' positive and negative sentiment toward each vaccine
reviews_type =
reviews_words %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread sentiments into columns, fill missing with 0
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
  data.frame()
Joining, by = "word"
`summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
names(reviews_type)[2]="reviews_negative"
names(reviews_type)[3]="reviews_positive"

vaccine_national_url =
vaccine_words %>%
filter(word == "國產疫苗" | word == "高端疫苗"| word == "聯亞疫苗") %>%
inner_join(article_type) %>%
  mutate(vaccine='national')
Joining, by = "artUrl"
vaccine_BNT_url =
vaccine_words %>%
filter(word == "BNT" | word == "輝瑞疫苗"| word == "bnt") %>%
inner_join(article_type) %>%
  mutate(vaccine='BNT')
Joining, by = "artUrl"
vaccine_moderna_url =
vaccine_words %>%
filter(word == "莫德納" | word == "moderna") %>%
inner_join(article_type) %>%
  mutate(vaccine='moderna')
Joining, by = "artUrl"
vaccine_az_url =
vaccine_words %>%
filter(word == "AZ" | word == "az"| word == "Az"| word == "阿斯特捷利康") %>%
inner_join(article_type) %>%
  mutate(vaccine='az')
Joining, by = "artUrl"
vaccine_jj_url =
vaccine_words %>%
filter(word == "嬌生" | word == "Johnson & Johnson") %>%
inner_join(article_type) %>%
  mutate(vaccine='jj')
Joining, by = "artUrl"
# combine the per-vaccine posts
vaccine_filter_url = rbind(vaccine_national_url,vaccine_BNT_url,vaccine_moderna_url,vaccine_az_url,vaccine_jj_url)
vaccine_arturl = inner_join(reviews_type,vaccine_filter_url,by="artUrl")

vaccine_national_reviews =
vaccine_arturl %>%
filter(vaccine == "national") %>%
group_by(vaccine,type.x)%>%
  summarise(count=n())
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_BNT_reviews =
vaccine_arturl %>%
filter(vaccine == "BNT") %>%
group_by(vaccine,type.x)%>%
  summarise(count=n())
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_moderna_reviews =
vaccine_arturl %>%
filter(vaccine == "moderna") %>%
group_by(vaccine,type.x)%>%
  summarise(count=n())
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_az_reviews =
vaccine_arturl %>%
filter(vaccine == "az") %>%
group_by(vaccine,type.x)%>%
  summarise(count=n())
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
vaccine_jj_reviews =
vaccine_arturl %>%
filter(vaccine == "jj") %>%
group_by(vaccine,type.x)%>%
  summarise(count=n())
`summarise()` has grouped output by 'vaccine'. You can override using the `.groups` argument.
# combine the per-vaccine replies
vaccine_filter_reviews = rbind(vaccine_national_reviews,vaccine_BNT_reviews,vaccine_moderna_reviews,vaccine_az_reviews,vaccine_jj_reviews)

# vaccine_filter_reviews %>%
# ggplot( aes(x=vaccine, y=count, fill=type.x)) +
# geom_bar(stat = "identity") + ylab("count") +
# scale_fill_manual(values=c("#145374","red"))+
#   theme(axis.text.x = element_text(angle = 90, hjust = 1))

vaccine_filter_reviews %>%
mutate(total_count =sum(count))%>%
ggplot( aes(x=vaccine, y=count/total_count, fill=type.x)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=c("red","#145374"))+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
geom_hline(aes(yintercept=0.5), colour="white",lwd=2)
Repliers, by contrast, lean negative when discussing the various vaccines, taking a more pessimistic view of these events.

Load the ETtoday (東森新聞), UDN (聯合新聞), and Apple Daily (蘋果新聞) articles.
ettoday <- fread("東森新聞_articleMetaData.csv",encoding = "UTF-8") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
udn <- fread("聯合新聞_articleMetaData.csv",encoding = "UTF-8") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
apple <- fread("蘋果新聞_articleMetaData.csv",encoding = "UTF-8") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))ettoday <- ettoday %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))
udn <- udn %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))
apple <- apple %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))斷詞
jieba_tokenizer <- worker(user="vaccine.txt", stop_word = "stop_words.txt")
ettoday$sentence=ettoday$sentence %>% tolower()
ettoday_words <- ettoday %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(!str_detect(word, regex("[0-9]"))) # remove digits
udn$sentence=udn$sentence %>% tolower()
udn_words <- udn %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(!str_detect(word, regex("[0-9]"))) # remove digits
apple$sentence=apple$sentence %>% tolower()
apple_words <- apple %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(!str_detect(word, regex("[0-9]"))) # remove digits

LIWC$sentiment = ifelse(LIWC$sentiment=="positive",1,-1)

Compute the sentiment scores
sentiment_count = MToken %>%
select(artTitle,artDate,word) %>%
inner_join(LIWC) %>%
group_by(artTitle) %>%
mutate(P_N=ifelse(sum(sentiment)>0,"positive","negative")) %>%
filter(row_number()==1) %>%
group_by(artDate) %>%
summarise(P_number=sum(P_N=="positive"),
N_number=sum(P_N=="negative"),
article_n = n(),
p_ratio = P_number/article_n,
n_ratio = N_number/article_n,
source="ptt")Joining, by = "word"
sentiment_count=sentiment_count %>% select(artDate,p_ratio,n_ratio,source) %>% tidyr::gather(sentiment,ratio,-artDate,-source)ettoday_words = ettoday_words %>%
select(artTitle,artDate,word) %>%
inner_join(LIWC) %>%
group_by(artTitle) %>%
mutate(P_N=ifelse(sum(sentiment)>0,"positive","negative")) %>%
filter(row_number()==1) %>%
group_by(artDate) %>%
summarise(P_number=sum(P_N=="positive"),
N_number=sum(P_N=="negative"),
article_n = n(),
p_ratio = P_number/article_n,
n_ratio = N_number/article_n,
source="ettoday")Joining, by = "word"
ettoday_words=ettoday_words %>% select(artDate,p_ratio,n_ratio,source) %>% tidyr::gather(sentiment,ratio,-artDate,-source)udn_words = udn_words %>%
select(artTitle,artDate,word) %>%
inner_join(LIWC) %>%
group_by(artTitle) %>%
mutate(P_N=ifelse(sum(sentiment)>0,"positive","negative")) %>%
filter(row_number()==1) %>%
group_by(artDate) %>%
summarise(P_number=sum(P_N=="positive"),
N_number=sum(P_N=="negative"),
article_n = n(),
p_ratio = P_number/article_n,
n_ratio = N_number/article_n,
source="udn")Joining, by = "word"
udn_words=udn_words %>% select(artDate,p_ratio,n_ratio,source) %>% tidyr::gather(sentiment,ratio,-artDate,-source)apple_words = apple_words %>%
select(artTitle,artDate,word) %>%
inner_join(LIWC) %>%
group_by(artTitle) %>%
mutate(P_N=ifelse(sum(sentiment)>0,"positive","negative")) %>%
filter(row_number()==1) %>%
group_by(artDate) %>%
summarise(P_number=sum(P_N=="positive"),
N_number=sum(P_N=="negative"),
article_n = n(),
p_ratio = P_number/article_n,
n_ratio = N_number/article_n,
source="apple")Joining, by = "word"
apple_words=apple_words %>% select(artDate,p_ratio,n_ratio,source) %>% tidyr::gather(sentiment,ratio,-artDate,-source)ettoday_words$artDate=as.Date(ettoday_words$artDate)
udn_words$artDate=as.Date(udn_words$artDate)
apple_words$artDate=as.Date(apple_words$artDate)
all_data=bind_rows(sentiment_count,ettoday_words,udn_words,apple_words)

mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
all_data%>%
  group_by(artDate)%>%
  ggplot( aes(x=artDate, y=ratio, fill=sentiment )) +
  geom_bar(stat = "identity") + ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  scale_fill_manual(values=mycolors[c(1,5,8,12)])+
  scale_x_date(labels = scales::date_format("%m/%d") )+
  facet_wrap(~source,ncol = 1, scales="free_y")
Overall, the public's sentiment on the vaccine issue is mostly negative and revolves largely around the politics behind the vaccines. This suggests people may not be satisfied with the government's vaccine policy and hope the government can secure enough vaccines so Taiwan can get through the outbreak as soon as possible.