Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','plotly')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(NLP)
library(jiebaR)
library(widyr)
library(ggraph)
library(igraph)
library(plotly)

◆ Data source: PTT Gossiping (八卦) and Stock (股票) boards, collected via the text-collection platform
◆ Dataset: 0611_articleMetaData.csv
◆ Keywords: 國產疫苗 (domestic vaccine), 高端 (Medigen), 聯亞 (UBI), 解盲 (unblinding)
◆ Time range: 2020-04-01 ~ 2021-06-11
◆ Some PTT articles are unrelated to the topic, so articles are filtered by the required keywords.
# Read in the articles and the comments
MetaData = fread('./data/0611_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('./data/0611_articleReviews.csv',encoding = 'UTF-8')
# Filter the articles again: 2,779 articles remain
keywords = c('國產疫苗','高端','聯亞','解盲')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])%>%
  # clean up the sentence column
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/B-Yb-y0-9\\.]+", " ", sentence))MetaData<- MetaData %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))
# Keep only the comments that belong to the filtered articles
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

# Load a jieba worker with the user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user="./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")
# Define the tokenizer function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
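A quick check of what the tokenizer returns; the sample sentence below is made up for illustration, and the exact segmentation depends on the dictionaries:
# The tokenizer takes a character vector and returns a list of token vectors
customized_tokenizer("高端疫苗二期解盲成功")
# e.g. list(c("高端", "疫苗", "二期", "解盲", "成功"))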
# Tokenize the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
Rdata <- RToken
# Combine the article tokens and the comment tokens
data <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])

◆ Format the date column
◆ Remove special characters and words with very low frequency
# Format the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
# Filter out special characters
data_select = data %>%
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  # Note: the range "b-y" skips 'a' and 'z', so tokens such as "az" pass this filter;
  # use "[A-Za-z0-9]" if all English letters and digits should be removed.
  filter(!grepl("['^0-9b-y']",word)) %>% # remove (most) English letters and digits
  filter(nchar(.$word)>1)
# Count each word's frequency per day
# word_count:artDate,word,count
word_count <- data_select %>%
select(artDate,word) %>%
group_by(artDate,word) %>%
  summarise(count=n()) %>% # count per day and word with summarise
  filter(count>3) %>% # drop words that appear too few times
arrange(desc(count))
word_count
## # A tibble: 37,548 x 3
## # Groups: artDate [48]
## artDate word count
## <date> <chr> <int>
## 1 2021-05-30 疫苗 4348
## 2 2021-06-10 疫苗 3351
## 3 2021-05-31 疫苗 3175
## 4 2021-06-01 疫苗 2387
## 5 2021-05-30 國產 1868
## 6 2021-05-28 疫苗 1861
## 7 2021-06-10 高端 1828
## 8 2021-05-27 疫苗 1713
## 9 2021-05-30 高端 1652
## 10 2021-06-10 三期 1554
## # ... with 37,538 more rows
# A plain jieba worker (without the user dictionary) for the n-gram tokenizers below
jieba_tokenizer = worker()
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}

vac_bigram <- MetaData %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
vac_bigram
## artTitle artDate artTime
## 1: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
## 2: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
## 3: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
## 4: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
## 5: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
## ---
## 391638: Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391639: Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391640: Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391641: Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391642: Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html senafeld
## 2: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html senafeld
## 3: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html senafeld
## 4: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html senafeld
## 5: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html senafeld
## ---
## 391638: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html l75cm
## 391639: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html l75cm
## 391640: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html l75cm
## 391641: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html l75cm
## 391642: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html l75cm
## artCat commentNum push boo bigram
## 1: Gossiping 19 4 8 印度 阿三
## 2: Gossiping 19 4 8 阿三 我
## 3: Gossiping 19 4 8 我 只是
## 4: Gossiping 19 4 8 只是 好奇
## 5: Gossiping 19 4 8 好奇 ㄧ
## ---
## 391638: Stock 71 30 8 賭贏 恭喜
## 391639: Stock 71 30 8 恭喜 你
## 391640: Stock 71 30 8 你 賭輸
## 391641: Stock 71 30 8 賭輸 就
## 391642: Stock 71 30 8 就 願賭服輸
# Remove bigrams that contain English letters or digits
# and count how often each bigram appears
vac_bigram %>%
  filter(!str_detect(bigram, regex("[0-9b-yB-Y]"))) %>%
  count(bigram, sort = TRUE)
## bigram n
## 1: 國產 疫苗 3359
## 2: 高端 疫苗 1199
## 3: 疫苗 的 944
## 4: 的 疫苗 896
## 5: 臨床 試驗 529
## ---
## 183755: 鑽石 稱作 1
## 183756: 籲 中央 1
## 183757: 籲 成立 1
## 183758: 籲推 疫苗 1
## 183759: 籲讓 會同 1
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}

vac_trigram <- MetaData %>%
unnest_tokens(ngrams, sentence, token = jieba_trigram)
vac_trigram %>%
filter(!str_detect(ngrams, regex("[0-9b-yB-Y]"))) %>%
  count(ngrams, sort = TRUE)
## ngrams n
## 1: 國產 疫苗 的 306
## 2: 打 國產 疫苗 278
## 3: 陳 時 中 244
## 4: 的 國產 疫苗 187
## 5: 中 和 抗體 160
## ---
## 277101: 鑽石 稱作 血 1
## 277102: 籲 中央 讓 1
## 277103: 籲 成立 疫苗 1
## 277104: 籲推 疫苗 緊急 1
## 277105: 籲讓 會同 如果 1
# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
## Warning in scan(file = "./dict/stop_words.txt", what = character(), sep =
## "\n", : invalid input found on input connection './dict/stop_words.txt'
# bigram
vac_bigram %>%
filter(!str_detect(bigram, regex("[0-9b-yB-Y]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")## bigram n
## 1: 國產 疫苗 3359
## 2: 高端 疫苗 1199
## 3: 臨床 試驗 529
## 4: 國外 疫苗 393
## 5: 莫 德納 345
## ---
## 107540: 鑽石 稱作 1
## 107541: 籲 中央 1
## 107542: 籲 成立 1
## 107543: 籲推 疫苗 1
## 107544: 籲讓 會同 1
# trigram
vac_trigram %>%
filter(!str_detect(ngrams, regex("[0-9b-yB-Y]"))) %>%
separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("ngrams", c("word1", "word2", "word3"), sep=" ")## ngrams n
## 1: 陳 時 中 244
## 2: 三期 臨床 試驗 121
## 3: 緊急 使用 授權 100
## 4: 二期 臨床 試驗 98
## 5: 高端 聯 亞 89
## ---
## 101661: 趲 程 未有 1
## 101662: 鑽 中國 割 1
## 101663: 鑽石 稱作 血 1
## 101664: 籲 成立 疫苗 1
## 101665: 籲推 疫苗 緊急 1
The n-gram results can be used to build a better word-segmentation dictionary, as sketched below.
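For instance, frequent combinations found above (國產疫苗, 臨床試驗, 緊急使用授權) could be appended to the user dictionary so that jieba keeps them as single tokens. A minimal sketch, assuming ./dict/user_dict.txt is a plain one-term-per-line file:
# Append multi-word terms discovered via n-grams to the user dictionary,
# then rebuild the jieba worker so these terms are segmented as single tokens
new_terms <- c("國產疫苗", "臨床試驗", "緊急使用授權", "三期臨床試驗")
write(new_terms, file = "./dict/user_dict.txt", append = TRUE)
jieba_tokenizer <- worker(user = "./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")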
LIWC (Linguistic Inquiry and Word Count) is a sentiment dictionary published by the psychologist Pennebaker in 2001.
Its entries are divided into positive and negative sentiment words.
P <- read_file("./dict/liwc/positive.txt") # 正向字典txt檔
N <- read_file("./dict/liwc/negative.txt") # 負向字典txt檔
#字典txt檔讀進來是一整個字串
#typeof(P)# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
# 檢視字典
head(LIWC)## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
# sentiment_count:artDate,sentiment,count
sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
  summarise(count=n())

◆ Before plotting sentiment, look at how many articles were posted each day; discussion clearly picks up from late May 2021 onwards.
MetaData$artDate <- as.Date(MetaData$artDate,"%Y/%m/%d")
MetaData %>%
group_by(artDate) %>%
filter(artDate > "2021-05-01" && artDate < "2021-06-11")%>%
summarise(count = n()) %>%
ggplot(aes(artDate,count))+
geom_line(color="gray")+
geom_point(aes(colour = count))+
ggtitle("文章發表數量走勢圖") + xlab("Date") +
scale_x_date(date_breaks="7 days", date_labels="%m-%d") -> date_plot
ggplotly(date_plot)

# Check the date range of the sentiment data
# range(sentiment_count$artDate)  # "2021-04-06" "2021-06-10"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-04-06','2021-06-10')),
date_breaks="7 days", date_labels="%m/%d")+
  # add a vertical line marking 2021-05-30
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-05-30'))[1]])),
             colour = "gray") +
ggtitle("正負情緒分數折線圖") + xlab("Date(2021)") +ylab("count") ->liwc_plot
ggplotly(liwc_plot)

sentiment_count %>%
  # normalize: each sentiment's share of the day's sentiment words
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-04-06','2021-06-10')),
date_breaks="7 days", date_labels="%m/%d") ->std_liwc
ggplotly(std_liwc)

◆ Pick out a few dates with sentiment peaks:
1. 2021-05-30
2. 2021-06-10
# Rank the days by their total number of sentiment words
sentiment_count %>%
select(count,artDate) %>%
group_by(artDate) %>%
summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 50 x 2
## artDate sum
## <date> <int>
## 1 2021-05-30 11229
## 2 2021-06-10 9074
## 3 2021-05-31 8040
## 4 2021-06-01 7575
## 5 2021-06-09 4638
## 6 2021-06-08 4250
## 7 2021-06-03 4184
## 8 2021-06-07 3975
## 9 2021-06-04 3855
## 10 2021-05-28 3653
## # ... with 40 more rows
Pick the dates of interest and draw word clouds to see what topics were being discussed.
Start with the sentiment peak on 2021-05-30. Matching the sentiment analysis above, there are not many strongly negative words, which suggests the domestic-vaccine topic was mostly discussed in a rational tone. The main points of discussion include "whether the domestic vaccine should be administered" and "applying for an EUA before Phase III trials have passed".
# Draw the word cloud
set.seed(817)
word_count %>%
filter(!(word %in% c("疫苗","國產","政府","高端"))) %>%
filter(artDate == as.Date('2021-05-30')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
  filter(count>50) %>% # drop words that appear too few times
  wordcloud2(size = 1, color='random-light', backgroundColor="black",shape = "star")

Judging from the result, most of the discussion still revolves around the domestic-vaccine issue itself, although some abusive language is still visible in the word cloud.
Next, 2021-06-10, the day the Phase II clinical trial was successfully unblinded.
# Draw the word cloud
set.seed(1450)
word_count %>%
filter(!(word %in% c("疫苗","國產","政府","高端"))) %>%
filter(artDate == as.Date('2021-06-10')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
  filter(count>50) %>% # drop words that appear too few times
wordcloud2(size = 1, color='random-light', backgroundColor="black")
# 0610_wc.png

After the government announced on 6/10 that the Phase II trial had been successfully unblinded, the discussion shifted toward the vaccine's protective efficacy and whether the domestic vaccine could gain international recognition.
# sentiment_sum:word,sentiment,sum
sentiment_sum <-
word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip() -> plot
ggplotly(plot)

sentiment_sum %>%
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon1", "cyan3"), # positive negative
    max.words = 100)

The sentiment analysis so far has mostly summed over all words. Next, the positive and negative articles are separated to see whether anything new turns up: for each article the positive and negative word counts are computed, and the article is roughly classified as positive or negative.
# Classify each article by its balance of positive and negative words
article_type =
data_select %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread positive/negative into columns, filling missing values with 0
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
data.frame()
# How many articles of each type are there?
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 1658
## 2 positive 1071
Pull out the positive and negative articles and join them with the tokenization results.
# negative_article:artUrl,word
negative_article <-
article_type %>%
filter(type=="negative")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# positive_article:artUrl,word
positive_article <-
article_type %>%
filter(type=="positive")%>%
select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Plot the keywords that contribute most to sentiment in the negative and positive articles (a sketch for the positive articles follows the negative plot below).
# Keyword contribution plot for the negative articles
negative_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to negative sentiment",
x = NULL) +
theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
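The positive-article counterpart is not shown above; a sketch that mirrors the negative plot, using the same pipeline with positive_article:
# Keyword contribution plot for the positive articles (mirrors the negative plot above)
positive_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()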
# Count each word's frequency within each article (input for tf-idf)
vac_words <- data_select %>%
filter(!str_detect(word, regex("[0-9b-yB-Z]"))) %>%
count(artUrl, word, sort = TRUE)
vac_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html 蟑螂 350
## 2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html 希望 340
## 3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html az 226
## 4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html 疫苗 222
## 5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html 疫苗 209
## ---
## 536912: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸 1
## 536913: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 嚴重 1
## 536914: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 護航 1
## 536915: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 聽聽 1
## 536916: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 體量 1
total_words <- vac_words %>%
group_by(artUrl) %>%
summarize(total = sum(n))
vac_words <- left_join(vac_words, total_words)
vac_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html 蟑螂 350
## 2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html 希望 340
## 3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html az 226
## 4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html 疫苗 222
## 5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html 疫苗 209
## ---
## 536912: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸 1
## 536913: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 嚴重 1
## 536914: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 護航 1
## 536915: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 聽聽 1
## 536916: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 體量 1
## total
## 1: 1679
## 2: 3632
## 3: 1170
## 4: 4918
## 5: 4845
## ---
## 536912: 289
## 536913: 289
## 536914: 289
## 536915: 289
## 536916: 289
◆ Compute each word's tf-idf value, treating each article as a single document
vac_words_tf_idf <- vac_words %>%
bind_tf_idf(word, artUrl, n)
vac_words_tf_idf
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html 蟑螂 350
## 2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html 希望 340
## 3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html az 226
## 4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html 疫苗 222
## 5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html 疫苗 209
## ---
## 536912: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸 1
## 536913: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 嚴重 1
## 536914: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 護航 1
## 536915: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 聽聽 1
## 536916: https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 體量 1
## total tf idf tf_idf
## 1: 1679 0.208457415 1.55482161 0.324114094
## 2: 3632 0.093612335 1.48095704 0.138635846
## 3: 1170 0.193162393 1.07338445 0.207337508
## 4: 4918 0.045140301 0.07352686 0.003319025
## 5: 4845 0.043137255 0.07352686 0.003171747
## ---
## 536912: 289 0.003460208 5.73262185 0.019836062
## 536913: 289 0.003460208 2.14602125 0.007425679
## 536914: 289 0.003460208 1.88959172 0.006538380
## 536915: 289 0.003460208 4.56255060 0.015787372
## 536916: 289 0.003460208 7.92984643 0.027438915
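For reference, bind_tf_idf() computes tf = n / total within each article and idf = ln(N / df), where N is the number of articles and df is the number of articles containing the word. A minimal sketch recomputing the same columns by hand, which should agree with the output above:
# Manual tf-idf recomputation; tf, idf and tf_idf should match bind_tf_idf()
n_articles <- n_distinct(vac_words$artUrl)
manual_tf_idf <- vac_words %>%
  group_by(word) %>%
  mutate(df = n_distinct(artUrl)) %>%   # number of articles containing the word
  ungroup() %>%
  mutate(tf = n / total,
         idf = log(n_articles / df),
         tf_idf = tf * idf)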
# Pick the five words with the largest tf-idf in each article
vac_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=5) %>%
  arrange(desc(artUrl))
## # A tibble: 16,698 x 7
## # Groups: artUrl [2,779]
## artUrl word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Stock/M.162333~ 李秉穎~ 5 289 0.0173 4.93 0.0854
## 2 https://www.ptt.cc/bbs/Stock/M.162333~ 笑秉 3 289 0.0104 6.54 0.0679
## 3 https://www.ptt.cc/bbs/Stock/M.162333~ 專業 8 289 0.0277 2.17 0.0600
## 4 https://www.ptt.cc/bbs/Stock/M.162333~ 中和性~ 3 289 0.0104 5.73 0.0595
## 5 https://www.ptt.cc/bbs/Stock/M.162333~ 血清 4 289 0.0138 4.24 0.0587
## 6 https://www.ptt.cc/bbs/Stock/M.162333~ 屁眼 23 629 0.0366 4.37 0.160
## 7 https://www.ptt.cc/bbs/Stock/M.162333~ 尊重 26 629 0.0413 3.76 0.155
## 8 https://www.ptt.cc/bbs/Stock/M.162333~ 大黑 10 629 0.0159 6.32 0.100
## 9 https://www.ptt.cc/bbs/Stock/M.162333~ 拍拍 6 629 0.00954 5.44 0.0519
## 10 https://www.ptt.cc/bbs/Stock/M.162333~ 給推 9 629 0.0143 3.42 0.0489
## # ... with 16,688 more rows
# From each article, pick the ten words with the largest tf-idf,
# then count how many times each word gets picked
vac_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
  count(word, sort=TRUE)
## # A tibble: 20,688 x 2
## word n
## <chr> <int>
## 1 三期 105
## 2 二期 72
## 3 跌停 68
## 4 聯亞 64
## 5 出國 62
## 6 成功 60
## 7 失敗 57
## 8 抗體 51
## 9 解盲 49
## 10 中國 45
## # ... with 20,678 more rows
Using only the netizens' comments, we count which words co-occur in the same article to see how the domestic vaccine looks in the netizens' eyes.
Rdata$artDate= Rdata$artDate %>% as.Date("%Y/%m/%d")
# Rvac_words: per-article word counts built from the comments only
Rvac_words <- Rdata %>%
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(!grepl("['^0-9b-y']",word)) %>% # remove (most) English letters and digits
  filter(nchar(.$word)>1) %>%
  filter(!str_detect(word, regex("[0-9b-yB-Y]"))) %>%
  count(artUrl, word, sort = TRUE)
Rtotal_words <- Rvac_words %>%
group_by(artUrl) %>%
summarize(total = sum(n))
# Join in each article's total word count
Rvac_words <- left_join(Rvac_words, Rtotal_words)

vac_pairs <- Rvac_words %>%
pairwise_count(word, artUrl, sort = TRUE)
vac_pairs
## # A tibble: 118,710,578 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 國產 疫苗 1452
## 2 疫苗 國產 1452
## 3 高端 疫苗 1418
## 4 疫苗 高端 1418
## 5 台灣 疫苗 1351
## 6 疫苗 台灣 1351
## 7 政府 疫苗 1185
## 8 疫苗 政府 1185
## 9 三期 疫苗 1158
## 10 疫苗 三期 1158
## # ... with 118,710,568 more rows
word_cors <- Rvac_words %>%
group_by(word) %>%
  filter(n() >= 10) %>%   # keep words that appear in at least 10 articles
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 42,139,572 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 左手 右手 0.965
## 2 右手 左手 0.965
## 3 混為 一談 0.957
## 4 一談 混為 0.957
## 5 家園 非核 0.953
## 6 非核 家園 0.953
## 7 趕美 超英 0.914
## 8 超英 趕美 0.914
## 9 賢者 之石 0.912
## 10 之石 賢者 0.912
## # ... with 42,139,562 more rows
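pairwise_cor() computes, for each word pair, the Pearson correlation (the phi coefficient) of the words' presence/absence across articles. A minimal manual check for one pair from the output above; the value may differ slightly because word_cors drops words appearing in fewer than 10 articles:
# Manual phi-coefficient check for one word pair
articles <- unique(Rvac_words$artUrl)
has_word <- function(w) articles %in% Rvac_words$artUrl[Rvac_words$word == w]
cor(as.numeric(has_word("左手")), as.numeric(has_word("右手")))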
### Co-occurrence network
◆ Use a word-relationship graph to show pairs with correlation greater than 0.7 (matching the filter in the code below).
set.seed(2020)
word_cors %>%
filter(correlation > 0.7) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so Chinese characters display correctly
theme_void()