1. Package Loading and Data Introduction

System locale settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters

Install the required packages

# echo = T, results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr",
             "data.table", "reshape2", "wordcloud", "tidyr", "scales", "plotly",
             "NLP", "jiebaR", "widyr", "ggraph", "igraph")  # include the n-gram / graph packages loaded below
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(NLP)
library(jiebaR)
library(widyr)
library(ggraph)
library(igraph)
library(plotly)

Data overview

◆ Data source: PTT Gossiping and Stock boards, collected via the text-analytics platform
◆ Dataset: 0611_articleMetaData.csv
◆ Keywords: 國產疫苗 (domestic vaccine), 高端, 聯亞, 解盲 (unblinding)
◆ Time range: 2020-04-01 ~ 2021-06-11



Research motivation:




Read in the files & initial filtering

◆ Some PTT articles are unrelated to the topic, so we keep only articles that contain the required keywords.

# Read in the articles and the comments
MetaData = fread('./data/0611_articleMetaData.csv', encoding = 'UTF-8')
Reviews  = fread('./data/0611_articleReviews.csv', encoding = 'UTF-8')

# Filter the articles again: 2,779 articles remain
keywords = c('國產疫苗','高端','聯亞','解盲')
toMatch = paste(keywords, collapse = "|")
MetaData = with(MetaData, MetaData[grepl(toMatch, sentence) | grepl(toMatch, artTitle),]) %>%
  # clean up the sentence column
  mutate(sentence = gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence = gsub("\n", "", sentence)) %>% 
  mutate(sentence = gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence)) # strip URLs

Remove the template wording that appears when news articles are reposted on PTT

MetaData<- MetaData %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))

# Attach the matching comments to each article
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

2. Data Preprocessing

Tokenizing the articles

Set up the tokenizer

# Initialize jieba with a user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user = "./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")

# Define the tokenization function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
# Tokenize both the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token = customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token = customized_tokenizer)
Rdata <- RToken

# Combine the two token tables
data <- rbind(MToken[, c("artDate", "artUrl", "word")], RToken[, c("artDate", "artUrl", "word")]) 

Data cleaning

◆ Format the dates ◆ Remove special characters and very low-frequency words

# Format the date column
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")

# Filter out special characters
data_select = data %>% 
  filter(!grepl('[[:punct:]]', word)) %>%   # remove punctuation
  filter(!grepl("['^0-9b-y']", word)) %>%   # remove English letters and digits
  filter(nchar(.$word) > 1) 
  
# Count the daily frequency of each word
# word_count: artDate, word, count
word_count <- data_select %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>%  # use summarise to get per-day word totals
  filter(count > 3) %>%       # drop words that appear too rarely
  arrange(desc(count))

word_count
## # A tibble: 37,548 x 3
## # Groups:   artDate [48]
##    artDate    word  count
##    <date>     <chr> <int>
##  1 2021-05-30 疫苗   4348
##  2 2021-06-10 疫苗   3351
##  3 2021-05-31 疫苗   3175
##  4 2021-06-01 疫苗   2387
##  5 2021-05-30 國產   1868
##  6 2021-05-28 疫苗   1861
##  7 2021-06-10 高端   1828
##  8 2021-05-27 疫苗   1713
##  9 2021-05-30 高端   1652
## 10 2021-06-10 三期   1554
## # ... with 37,538 more rows

3. Using N-grams to Help Build the User Dictionary

Bigram

Bigram function

# Re-initialize jieba with the default dictionary for the n-gram exploration
jieba_tokenizer = worker()
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x) > 1){
      tokens <- segment(x, jieba_tokenizer)
      bigram <- ngrams(tokens, 2)           # NLP::ngrams
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
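For illustration, a quick call on a single made-up sentence might look like the sketch below; the exact tokens, and therefore the bigrams, depend on jieba's default dictionary.

# Illustrative only: the input sentence is an example, not part of the dataset
jieba_bigram("國產疫苗二期解盲成功")
# returns a list of space-separated bigrams such as "國產 疫苗" "疫苗 二期" ...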

Run the bigram tokenization

vac_bigram <- MetaData %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
vac_bigram
##                              artTitle    artDate  artTime
##      1: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
##      2: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
##      3: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
##      4: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
##      5: Re:[爆卦]高端疫苗一期數據公佈 2021/04/07 00:58:44
##     ---                                                  
## 391638:   Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391639:   Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391640:   Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391641:   Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
## 391642:   Re:[心得]高端記者會內容概要 2021/06/10 14:59:51
##                                                           artUrl artPoster
##      1: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html  senafeld
##      2: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html  senafeld
##      3: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html  senafeld
##      4: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html  senafeld
##      5: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html  senafeld
##     ---                                                                   
## 391638:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     l75cm
## 391639:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     l75cm
## 391640:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     l75cm
## 391641:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     l75cm
## 391642:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     l75cm
##            artCat commentNum push boo      bigram
##      1: Gossiping         19    4   8   印度 阿三
##      2: Gossiping         19    4   8     阿三 我
##      3: Gossiping         19    4   8     我 只是
##      4: Gossiping         19    4   8   只是 好奇
##      5: Gossiping         19    4   8     好奇 ㄧ
##     ---                                          
## 391638:     Stock         71   30   8   賭贏 恭喜
## 391639:     Stock         71   30   8     恭喜 你
## 391640:     Stock         71   30   8     你 賭輸
## 391641:     Stock         71   30   8     賭輸 就
## 391642:     Stock         71   30   8 就 願賭服輸

Count the most frequent bigram combinations

# Drop bigram combinations containing English letters or digits
# and count how often each combination appears
vac_bigram %>%
  filter(!str_detect(bigram, regex("[0-9b-yB-Y]"))) %>%
  count(bigram, sort = TRUE)
##            bigram    n
##      1: 國產 疫苗 3359
##      2: 高端 疫苗 1199
##      3:   疫苗 的  944
##      4:   的 疫苗  896
##      5: 臨床 試驗  529
##     ---               
## 183755: 鑽石 稱作    1
## 183756:   籲 中央    1
## 183757:   籲 成立    1
## 183758: 籲推 疫苗    1
## 183759: 籲讓 會同    1

Trigram

Trigram function

jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}

Run the trigram tokenization

vac_trigram <- MetaData %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
vac_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9b-yB-Y]"))) %>%
  count(ngrams, sort = TRUE)
##                 ngrams   n
##      1:   國產 疫苗 的 306
##      2:   打 國產 疫苗 278
##      3:       陳 時 中 244
##      4:   的 國產 疫苗 187
##      5:     中 和 抗體 160
##     ---                   
## 277101:   鑽石 稱作 血   1
## 277102:     籲 中央 讓   1
## 277103:   籲 成立 疫苗   1
## 277104: 籲推 疫苗 緊急   1
## 277105: 籲讓 會同 如果   1

Remove stop words

# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./dict/stop_words.txt", what = character(), sep =
## "\n", : 輸入連結 './dict/stop_words.txt' 中的輸入不正確
# bigram
vac_bigram %>%
  filter(!str_detect(bigram, regex("[0-9b-yB-Y]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")
##            bigram    n
##      1: 國產 疫苗 3359
##      2: 高端 疫苗 1199
##      3: 臨床 試驗  529
##      4: 國外 疫苗  393
##      5:   莫 德納  345
##     ---               
## 107540: 鑽石 稱作    1
## 107541:   籲 中央    1
## 107542:   籲 成立    1
## 107543: 籲推 疫苗    1
## 107544: 籲讓 會同    1
# trigram
vac_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9b-yB-Y]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
##                 ngrams   n
##      1:       陳 時 中 244
##      2: 三期 臨床 試驗 121
##      3: 緊急 使用 授權 100
##      4: 二期 臨床 試驗  98
##      5:     高端 聯 亞  89
##     ---                   
## 101661:     趲 程 未有   1
## 101662:     鑽 中國 割   1
## 101663:   鑽石 稱作 血   1
## 101664:   籲 成立 疫苗   1
## 101665: 籲推 疫苗 緊急   1

Based on the N-gram results, we can build a better user dictionary for segmentation.
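For example, a minimal sketch of this step, assuming user_dict.txt is a plain one-term-per-line file as jiebaR expects; the specific terms below are examples taken from the n-gram output above.

# Sketch only: append frequently seen multi-word terms to the user dictionary
new_terms <- c("國產疫苗", "臨床試驗", "陳時中", "緊急使用授權", "中和抗體")
write(new_terms, file = "./dict/user_dict.txt", append = TRUE)  # one term per line

# Re-create the tokenizer so it picks up the updated dictionary
jieba_tokenizer <- worker(user = "./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")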

4. Sentiment Analysis

Prepare the sentiment dictionary (LIWC)

LIWC stands for Linguistic Inquiry and Word Count, published by the psychologist Pennebaker in 2001.
The dictionary is divided into positive and negative sentiment words.

Read the files; words are separated by commas

P <- read_file("./dict/liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("./dict/liwc/negative.txt") # negative-word dictionary (txt)

# each dictionary file is read in as one long string
# typeof(P)

Split the strings into words and merge the two sentiment dictionaries

# Split each string on ","
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words

# Combine the two dictionaries
LIWC = rbind(P, N)

# Inspect the dictionary
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

Join the tokens with the LIWC dictionary and compute the daily sentiment totals (sentiment_count)

# sentiment_count: artDate, sentiment, count
sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  

Trend of daily article counts

◆ Before plotting sentiment, look at the daily posting volume; discussion picks up noticeably from late May 2021 onward.

MetaData$artDate <- as.Date(MetaData$artDate, "%Y/%m/%d")
MetaData %>% 
  group_by(artDate) %>% 
  filter(artDate > "2021-05-01" & artDate < "2021-06-11") %>%  # use the vectorized &, not &&
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
    geom_line(color = "gray") +
    geom_point(aes(colour = count)) +
  ggtitle("文章發表數量走勢圖") + xlab("Date") +
  scale_x_date(date_breaks = "7 days", date_labels = "%m-%d") -> date_plot
ggplotly(date_plot)

Line charts of positive and negative sentiment

Dates and positive/negative sentiment scores

Line chart of positive/negative sentiment scores

# Check the date range of the data
# range(sentiment_count$artDate): "2021-04-06" "2021-06-10"
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-04-06','2021-06-10')),
               date_breaks = "7 days", date_labels = "%m/%d") +
  # add a vertical line marking 2021-05-30
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-05-30'))[1]])), colour = "gray") +
  ggtitle("正負情緒分數折線圖") + xlab("Date(2021)") + ylab("count") -> liwc_plot
ggplotly(liwc_plot)

Normalized positive/negative sentiment ratios

sentiment_count %>% 
  # normalize: convert counts to daily ratios
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
  ggplot() +
  geom_line(aes(x = artDate, y = ratio, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-04-06','2021-06-10')),
               date_breaks = "7 days", date_labels = "%m/%d") -> std_liwc
ggplotly(std_liwc)

◆ Pick out a few dates with sentiment peaks:
1. 2021-05-30
2. 2021-06-10

# Rank the dates by total sentiment score
sentiment_count %>%
  select(count,artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 50 x 2
##    artDate      sum
##    <date>     <int>
##  1 2021-05-30 11229
##  2 2021-06-10  9074
##  3 2021-05-31  8040
##  4 2021-06-01  7575
##  5 2021-06-09  4638
##  6 2021-06-08  4250
##  7 2021-06-03  4184
##  8 2021-06-07  3975
##  9 2021-06-04  3855
## 10 2021-05-28  3653
## # ... with 40 more rows

Word clouds

Pick the dates of interest and draw word clouds to see which topics were being discussed.

We start from the 2021-05-30 sentiment peak. Matching it against the sentiment analysis above, not many negative sentiment words appear, which suggests people discussed the domestic-vaccine issue in a fairly rational tone. The main topics include "whether the domestic vaccine should be administered" and "applying for an EUA before Phase III trials have passed", among others. (05-30 news)

2021-05-30 word cloud

# Draw the word cloud
set.seed(817)
word_count %>% 
  filter(!(word %in% c("疫苗","國產","政府","高端"))) %>%
  filter(artDate == as.Date('2021-05-30')) %>% 
  select(word, count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 50) %>%   # drop words that appear too rarely
  wordcloud2(size = 1, color = 'random-light', backgroundColor = "black", shape = "star")

From the results, most of the discussion still centers on the domestic-vaccine issue itself, though some abusive terms are still visible in the word cloud.

2021-06-10 word cloud

The Phase II clinical trial was unblinded successfully

# Draw the word cloud
set.seed(1450)
word_count %>% 
  filter(!(word %in% c("疫苗","國產","政府","高端"))) %>%
  filter(artDate == as.Date('2021-06-10')) %>% 
  select(word, count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 50) %>%   # drop words that appear too rarely
  wordcloud2(size = 1, color = 'random-light', backgroundColor = "black")
# 0610_wc.png

After the government announced on 6/10 that the Phase II trial had been unblinded successfully, the discussion shifted toward the vaccine's protective efficacy and whether the domestic vaccine can gain international recognition.

Find representative words from the sentiment dictionary

Representative positive and negative words

# sentiment_sum: word, sentiment, sum
sentiment_sum <- 
  word_count %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
  
sentiment_sum %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip() -> senti_plot   # avoid masking base::plot()
ggplotly(senti_plot)

Positive/negative sentiment comparison word cloud

sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon1", "cyan3"), # positive negative
                   max.words = 100)

Classify articles as positive or negative

So far the sentiment analysis has mostly summed over all words. Next we separate positive and negative articles to see whether anything new emerges. We compute each article's positive and negative sentiment scores and classify it as roughly positive or negative.

# Classify each article by comparing its positive and negative sentiment counts
article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl, sentiment) %>%
  summarise(count = n()) %>%
  spread(sentiment, count, fill = 0) %>%  # spread positive/negative into columns, fill missing values with 0
  mutate(type = case_when(positive > negative ~ "positive", 
                          TRUE ~ "negative")) %>%
  data.frame() 
  
# How many positive and negative articles there are
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
##   type     count
##   <chr>    <int>
## 1 negative  1658
## 2 positive  1071

Pick out the positive and negative articles and merge them with the tokenization results.

# negative_article:artUrl,word
negative_article <-
article_type %>%
  filter(type=="negative")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

# positive_article:artUrl,word
positive_article <-
article_type %>%
  filter(type=="positive")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Plot the keywords that contribute most to sentiment in the negative and positive articles

Sentiment keywords: negative articles

# Contribution of keywords to sentiment in negative articles
negative_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = n()) %>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
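Only the negative-article chart is shown above; a mirrored sketch for the positive articles, using the same pipeline but swapping in positive_article and relabeling the y-axis, would be:

# Sketch: keyword contributions to sentiment in positive articles (mirrors the chart above)
positive_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()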

5. TF-IDF

Count how many times each word appears in each article

vac_words <- data_select %>%
  filter(!str_detect(word, regex("[0-9b-yB-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
vac_words
##                                                           artUrl     word   n
##      1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html     蟑螂 350
##      2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html     希望 340
##      3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html       az 226
##      4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html     疫苗 222
##      5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html     疫苗 209
##     ---                                                                      
## 536912:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸   1
## 536913:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     嚴重   1
## 536914:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     護航   1
## 536915:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     聽聽   1
## 536916:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     體量   1

Merge the required columns

total_words <- vac_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
vac_words <- left_join(vac_words, total_words)
vac_words
##                                                           artUrl     word   n
##      1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html     蟑螂 350
##      2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html     希望 340
##      3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html       az 226
##      4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html     疫苗 222
##      5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html     疫苗 209
##     ---                                                                      
## 536912:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸   1
## 536913:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     嚴重   1
## 536914:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     護航   1
## 536915:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     聽聽   1
## 536916:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     體量   1
##         total
##      1:  1679
##      2:  3632
##      3:  1170
##      4:  4918
##      5:  4845
##     ---      
## 536912:   289
## 536913:   289
## 536914:   289
## 536915:   289
## 536916:   289

Compute the tf-idf values

◆ Treating each article as a document, compute the tf-idf value of every word

vac_words_tf_idf <- vac_words %>%
  bind_tf_idf(word, artUrl, n)
vac_words_tf_idf
##                                                           artUrl     word   n
##      1: https://www.ptt.cc/bbs/Gossiping/M.1623154883.A.820.html     蟑螂 350
##      2: https://www.ptt.cc/bbs/Gossiping/M.1622728885.A.239.html     希望 340
##      3: https://www.ptt.cc/bbs/Gossiping/M.1623321467.A.DC2.html       az 226
##      4: https://www.ptt.cc/bbs/Gossiping/M.1622147019.A.075.html     疫苗 222
##      5: https://www.ptt.cc/bbs/Gossiping/M.1622078443.A.BBD.html     疫苗 209
##     ---                                                                      
## 536912:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html 願賭服輸   1
## 536913:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     嚴重   1
## 536914:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     護航   1
## 536915:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     聽聽   1
## 536916:     https://www.ptt.cc/bbs/Stock/M.1623337194.A.6E0.html     體量   1
##         total          tf        idf      tf_idf
##      1:  1679 0.208457415 1.55482161 0.324114094
##      2:  3632 0.093612335 1.48095704 0.138635846
##      3:  1170 0.193162393 1.07338445 0.207337508
##      4:  4918 0.045140301 0.07352686 0.003319025
##      5:  4845 0.043137255 0.07352686 0.003171747
##     ---                                         
## 536912:   289 0.003460208 5.73262185 0.019836062
## 536913:   289 0.003460208 2.14602125 0.007425679
## 536914:   289 0.003460208 1.88959172 0.006538380
## 536915:   289 0.003460208 4.56255060 0.015787372
## 536916:   289 0.003460208 7.92984643 0.027438915
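As a sanity check, a minimal sketch that recomputes tf and idf by hand, assuming (as tidytext documents) tf = n / total words in the document and idf = natural log of the number of documents over the number of documents containing the word:

# Sketch only: recompute tf-idf manually and compare with bind_tf_idf()
manual_tf_idf <- vac_words %>%
  group_by(word) %>%
  mutate(docs_with_word = n_distinct(artUrl)) %>%   # number of articles containing the word
  ungroup() %>%
  mutate(tf     = n / total,
         idf    = log(n_distinct(artUrl) / docs_with_word),
         tf_idf = tf * idf)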

Results

# For each article, select the five words with the highest tf-idf values
vac_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=5) %>%
  arrange(desc(artUrl))
## # A tibble: 16,698 x 7
## # Groups:   artUrl [2,779]
##    artUrl                                 word      n total      tf   idf tf_idf
##    <chr>                                  <chr> <int> <int>   <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/Stock/M.162333~ 李秉穎~     5   289 0.0173   4.93 0.0854
##  2 https://www.ptt.cc/bbs/Stock/M.162333~ 笑秉      3   289 0.0104   6.54 0.0679
##  3 https://www.ptt.cc/bbs/Stock/M.162333~ 專業      8   289 0.0277   2.17 0.0600
##  4 https://www.ptt.cc/bbs/Stock/M.162333~ 中和性~     3   289 0.0104   5.73 0.0595
##  5 https://www.ptt.cc/bbs/Stock/M.162333~ 血清      4   289 0.0138   4.24 0.0587
##  6 https://www.ptt.cc/bbs/Stock/M.162333~ 屁眼     23   629 0.0366   4.37 0.160 
##  7 https://www.ptt.cc/bbs/Stock/M.162333~ 尊重     26   629 0.0413   3.76 0.155 
##  8 https://www.ptt.cc/bbs/Stock/M.162333~ 大黑     10   629 0.0159   6.32 0.100 
##  9 https://www.ptt.cc/bbs/Stock/M.162333~ 拍拍      6   629 0.00954  5.44 0.0519
## 10 https://www.ptt.cc/bbs/Stock/M.162333~ 給推      9   629 0.0143   3.42 0.0489
## # ... with 16,688 more rows

Find the words with high tf-idf values across the whole corpus

# From each article, select the ten words with the highest tf-idf values,
# then count how often each word was selected
vac_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=10) %>%
  ungroup() %>%
  count(word, sort=TRUE)
## # A tibble: 20,688 x 2
##    word      n
##    <chr> <int>
##  1 三期    105
##  2 二期     72
##  3 跌停     68
##  4 聯亞     64
##  5 出國     62
##  6 成功     60
##  7 失敗     57
##  8 抗體     51
##  9 解盲     49
## 10 中國     45
## # ... with 20,678 more rows
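To make these corpus-level keywords easier to read, a small sketch that plots the 20 most frequently selected words as a bar chart (ggplot2 only, reusing the pipeline above):

# Sketch only: visualize the 20 most frequently selected tf-idf keywords
vac_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "cyan3") +
  coord_flip() +
  labs(x = NULL, y = "Times selected as a top tf-idf word")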

6. Word Correlation

Using only the netizens' comments, we compute which words co-occur, to see how the domestic vaccine looks through netizens' eyes.

Process the comments

Rdata$artDate = Rdata$artDate %>% as.Date("%Y/%m/%d")
Rvac_words <- Rdata %>% 
  filter(!grepl('[[:punct:]]', word)) %>%   # remove punctuation
  filter(!grepl("['^0-9b-y']", word)) %>%   # remove English letters and digits
  filter(nchar(.$word) > 1)  %>%
  filter(!str_detect(word, regex("[0-9b-yB-Y]"))) %>%
  count(artUrl, word, sort = TRUE)
Rtotal_words <- Rvac_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
# Merge the required columns
Rvac_words <- left_join(Rvac_words, Rtotal_words)

Count how many times each pair of words co-occurs

vac_pairs <- Rvac_words %>%
  pairwise_count(word, artUrl, sort = TRUE) 
vac_pairs
## # A tibble: 118,710,578 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 國產  疫苗   1452
##  2 疫苗  國產   1452
##  3 高端  疫苗   1418
##  4 疫苗  高端   1418
##  5 台灣  疫苗   1351
##  6 疫苗  台灣   1351
##  7 政府  疫苗   1185
##  8 疫苗  政府   1185
##  9 三期  疫苗   1158
## 10 疫苗  三期   1158
## # ... with 118,710,568 more rows

Compute the correlation between word pairs

word_cors <- Rvac_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors
## # A tibble: 42,139,572 x 3
##    item1 item2 correlation
##    <chr> <chr>       <dbl>
##  1 左手  右手        0.965
##  2 右手  左手        0.965
##  3 混為  一談        0.957
##  4 一談  混為        0.957
##  5 家園  非核        0.953
##  6 非核  家園        0.953
##  7 趕美  超英        0.914
##  8 超英  趕美        0.914
##  9 賢者  之石        0.912
## 10 之石  賢者        0.912
## # ... with 42,139,562 more rows
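As a quick follow-up, one can also inspect the words most strongly correlated with a term of interest, for example 高端; the column names follow widyr's pairwise_cor() output as shown above.

# Sketch: the ten words most strongly correlated with "高端"
word_cors %>%
  filter(item1 == "高端") %>%
  slice_max(correlation, n = 10)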

Co-occurrence network ◆ Use a word-relation graph to plot the pairs with correlation greater than 0.7

set.seed(2020)
word_cors %>%
  filter(correlation > 0.7) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese-capable font so labels render correctly
  theme_void()