1. Package Setup and Data Loading

Loading packages

library(readr)
library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
require(RColorBrewer)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(wordcloud2)
library(widyr)
library(ggraph)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Data description

  • Source: all articles and comments on the PTT Gossiping and Stock boards from 2021-04-01 to 2021-06-11, collected via the text platform
  • Dataset: 0611_articleMetaData.csv
  • Keywords: 國產疫苗 (domestic vaccine), 高端 (Medigen), 聯亞 (UBI), 解盲 (unblinding)
  • Time range: 2021-04-01 ~ 2021-06-11
  • Number of articles: 2779 in total
# Load the article metadata and the comment data
MetaData = fread('0611_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('0611_articleReviews.csv',encoding = 'UTF-8')

# attach each article's comments to its metadata
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent", "artPoster", "cmtPoster")], by = "artUrl")
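
A quick sanity check of the loaded data against the description above (a minimal sketch; it only assumes the artDate and artUrl columns used throughout this analysis):

nrow(MetaData)                   # expected: 2779 articles
range(as.Date(MetaData$artDate)) # expected: 2021-04-01 to 2021-06-11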

Changes in the number of articles over time

MetaData %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

A closer look at the time trend (2021/05-06)

MetaData %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  filter(format(artDate,'%Y%m') %in% c(202105, 202106))%>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()

> Discussion of the domestic vaccines heats up after 05/15, and the volume spikes sharply on 06/10.

2. Document Term Matrix (DTM)

Data preprocessing

Initialize a jieba word-segmentation engine with a user dictionary and stop-word list

jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
# Tokenize both the articles and the comments
#MToken <- MetaData %>% unnest_tokens(word, sentence, token=ptt_tokenizer)
#RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=ptt_tokenizer)

# combine the two token sets
#data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 
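
A minimal check of the tokenizer on one sample sentence (hypothetical input; the exact segmentation depends on the dictionaries loaded above):

# should return a list containing only multi-character tokens
ptt_tokenizer("國產疫苗三期臨床試驗解盲")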

Count how many times each token appears in each article

tokens <- MetaData %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens %>% head(20)
##                                                       artUrl   word count
##  1: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   阿三     1
##  2: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   不好     1
##  3: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   不禁     1
##  4: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   吹捧     2
##  5: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   大內     1
##  6: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   當初     1
##  7: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html 第三期     2
##  8: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   官員     1
##  9: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   國產     2
## 10: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   國民     1
## 11: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   國內     1
## 12: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   好多     1
## 13: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   好奇     1
## 14: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   機會     1
## 15: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   健康     1
## 16: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   嬌生     1
## 17: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   就要     1
## 18: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   請問     1
## 19: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   實驗     2
## 20: https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html   台灣     2

Convert the counts into a Document Term Matrix (DTM)

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 2779, terms: 23975)>>
## Non-/sparse entries: 144551/66481974
## Sparsity           : 100%
## Maximal term length: 26
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 17/83
## Sparsity           : 83%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       阿三 不好 不禁 吹捧
##   https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html    1    1    1    2
##   https://www.ptt.cc/bbs/Gossiping/M.1617764810.A.35C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618128969.A.D6E.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618141623.A.EE3.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618288420.A.ECD.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618361319.A.A4C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618363398.A.01A.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618839469.A.619.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1619682751.A.07C.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1619886717.A.AF1.html    0    0    0    0
##                                                           Terms
## Docs                                                       大內 當初 第三期
##   https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html    1    1      2
##   https://www.ptt.cc/bbs/Gossiping/M.1617764810.A.35C.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618128969.A.D6E.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618141623.A.EE3.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618288420.A.ECD.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618361319.A.A4C.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618363398.A.01A.html    0    0      0
##   https://www.ptt.cc/bbs/Gossiping/M.1618839469.A.619.html    0    0      1
##   https://www.ptt.cc/bbs/Gossiping/M.1619682751.A.07C.html    0    0      1
##   https://www.ptt.cc/bbs/Gossiping/M.1619886717.A.AF1.html    0    0      0
##                                                           Terms
## Docs                                                       官員 國產 國民
##   https://www.ptt.cc/bbs/Gossiping/M.1617757128.A.F6D.html    1    2    1
##   https://www.ptt.cc/bbs/Gossiping/M.1617764810.A.35C.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618128969.A.D6E.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618141623.A.EE3.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618288420.A.ECD.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618361319.A.A4C.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618363398.A.01A.html    0    3    0
##   https://www.ptt.cc/bbs/Gossiping/M.1618839469.A.619.html    5    6    2
##   https://www.ptt.cc/bbs/Gossiping/M.1619682751.A.07C.html    0    5    0
##   https://www.ptt.cc/bbs/Gossiping/M.1619886717.A.AF1.html    0    0    0
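
The DTM above is extremely sparse (2779 documents by 23975 terms). As an optional step that this analysis does not apply, tm's removeSparseTerms() can prune terms that appear in very few documents, shrinking the matrix before model fitting; a sketch:

# keep only terms present in at least 1% of documents (optional, not used below)
dtm_small <- removeSparseTerms(dtm, sparse = 0.99)
dtm_small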

3. Topic Models

Building the LDA model

lda <- LDA(dtm, k = 2, control = list(seed = 2021))
# lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # Gibbs sampling variant; tune alpha and delta
lda
## A LDA_VEM topic model with 2 topics.

Building the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 47,950 x 3
##    topic term       phi
##    <int> <chr>    <dbl>
##  1     1 阿三  4.14e-12
##  2     2 阿三  1.23e- 5
##  3     1 不好  4.33e- 4
##  4     2 不好  7.28e- 4
##  5     1 不禁  3.70e- 8
##  6     2 不禁  6.16e- 5
##  7     1 吹捧  4.67e- 5
##  8     2 吹捧  1.16e- 4
##  9     1 大內  1.43e- 4
## 10     2 大內  3.81e- 4
## # … with 47,940 more rows

Finding each topic's representative terms

Sort terms by phi within each topic, in descending order, and list the top 10 representative terms.

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

4. Finding the Optimal Number of Topics

Fitting models with more topics

Try 2, 3, 4, 6, 10, and 15 topics, save the results, and analyze them further.
(This step takes a while to run; the fitted models have been saved to ldas_result.rdata and can be loaded directly.)

ldas = c()
# topics = c(2,3,4,6,10,15)
# for(topic in topics){
#  start_time <- Sys.time()
#  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#  ldas =c(ldas,lda)
#  print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#  save(ldas,file = "ldas_result.rdata") # save the fitted models to a file
# }

Load the fitted LDA models

load("ldas_result.rdata")

Finding the optimal number of topics via perplexity

topics = c(2,3,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

  • More topics yields lower perplexity, i.e., purer topic content.
  • Pick the point where the decrease starts to level off, as sketched below.
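
A small sketch of that elbow heuristic: compute the drop in perplexity per additional topic between successive models and look for where it flattens (reuses the topics vector and ldas list from above):

perp <- map_dbl(ldas, topicmodels::perplexity)
# perplexity drop per additional topic between successive values of k
tibble(k = topics[-1], per_topic_drop = -diff(perp) / diff(topics))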

Generating the LDAvis output

Create the JSON that LDAvis needs
This function converts the model fitted with the LDA function above into the input format of the LDAvis package.

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
    ### The following function works around NAs that appear when the number of topics is large
    ### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
    ls_LDA = function (phi){
      jensenShannon <- function(x, y) {
        m <- 0.5 * (x + y)
        lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
        rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
        0.5 * sum(lhs) + 0.5 * sum(rhs)
      }
      dist.mat <- proxy::dist(x = phi, method = jensenShannon)
      pca.fit <- stats::cmdscale(dist.mat, k = 2)
      data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
    }
  
      # Find required quantities
      phi <- as.matrix(posterior(fitted)$terms)
      theta <- as.matrix(posterior(fitted)$topics)
      vocab <- colnames(phi)
      term_freq <- slam::col_sums(doc_term)
  
      # Convert to json
      json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                     vocab = vocab,
                                     doc.length = slam::row_sums(doc_term), # total token count per document
                                     term.frequency = term_freq, mds.method = ls_LDA)
  
      return(json_lda)
}
the_lda = ldas[[3]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
serVis(json_res,open.browser = T)

Generate the LDAvis files and save them locally

serVis(json_res, out.dir = "vis", open.browser = T)
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"),
           file("./vis/lda.json", encoding = "UTF-8")) # re-encode the JSON in place so Chinese terms display correctly

From the LDAvis results we can get a preliminary sense of what the four topics discuss:

  1. Terms such as 臨床 (clinical), 試驗 (trial), and 受試者 (trial subjects): likely discussion of the domestic vaccines' development status.
  2. Terms such as 國產 (domestic), 民進黨 (DPP), and 側翼 (partisan supporters): likely discussion of the domestic vaccines and politics.
  3. Terms such as 技術 (technology), 蛋白 (protein), and 副作用 (side effects): likely discussion of the domestic vaccines' technology.
  4. Terms such as 跌停 (limit-down), 基亞 (a related biotech stock), and 股價 (stock price): likely discussion of the vaccines' impact on stock prices.

5. LDA Analysis

Selecting the 4-topic model

the_lda = ldas[[3]] ## select the model with 4 topics
topics_words <- tidy(the_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
##    topic term     phi
##    <int> <chr>  <dbl>
##  1     2 疫苗  0.0756
##  2     1 疫苗  0.0654
##  3     4 高端  0.0552
##  4     3 疫苗  0.0512
##  5     3 國產  0.0299
##  6     4 疫苗  0.0266
##  7     2 國產  0.0246
##  8     1 國產  0.0187
##  9     2 台灣  0.0181
## 10     1 試驗  0.0178

Sort terms by phi within each topic, in descending order

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

Removing shared terms

e.g., 疫苗, 國產, 有沒有, 台灣, 高端, and so on

removed_word = c("疫苗","國產","有沒有","台灣","高端")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

Naming the topics

topics_name = c("國產疫苗的技術與進度","國產疫苗與國外疫苗","國產疫苗與政治","國產疫苗與股價")

Based on the terms above, the topics can be labeled as follows:

  1. Topic words include 試驗 (trial), 臨床 (clinical), 三期 (phase 3), and 二期 (phase 2), so the first panel's topic is the domestic vaccines' technology and clinical-trial progress.
  2. Topic words include 三期, 國際 (international), az, and 美國 (USA), so the second panel's topic is comparison and discussion of domestic versus foreign vaccines.
  3. Topic words include 三期, 二期, and 民進黨 (DPP), so the third panel's topic is the domestic vaccines and politics.
  4. Topic words include 跌停 (limit-down), 股價 (stock price), and 股票 (stocks), so the fourth panel's topic is the domestic vaccines' recent impact on the stock market.

Document-topic distribution

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[MetaData$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
ptt_topic = cbind(MetaData,document_topics_df)

# drop the commentNum, push, and boo columns
ptt_topic$commentNum = NULL
ptt_topic$push = NULL
ptt_topic$boo = NULL
# examine how topic volume changes over time
ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

Removing months with few articles

ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(!format(artDate,'%Y%m') %in% c(202104))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

Topic proportions over time (by month)

ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  filter( format(artDate,'%Y%m') %in% c(202105, 202106))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme_grey(base_family = "STKaiti" ) # avoid garbled Chinese characters

Topic proportions over time (by day)

ptt_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  filter(!format(artDate,'%Y%m') %in% c(202104))%>%
  group_by(artDate = format(artDate,'%m%d')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values=c("#cacaca","#a9c6de","#5588a3","#145374"))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme_grey(base_family = "STKaiti" ) + # avoid garbled Chinese characters
  geom_col(position="fill") -> pt

ggplotly(pt)

From the topic distributions in the plots we can infer:

  • In early May 2021, discussion mostly concerned the domestic vaccines and politics, along with their technology and development progress.
  • Throughout May and June 2021, the comparison between domestic and foreign vaccines was discussed continuously.
  • From June 2021 on, the domestic vaccines' impact on stock prices gained considerable traction.

6. Graph

Data preparation

phi_m <- topics_words %>% arrange(desc(phi)) %>% top_n(70)
## Selecting by phi
dtm <-phi_m %>% cast_dtm(topic, term, phi)

dtmm<-as.matrix(dtm)
dim(dtmm)
## [1]  4 50

Plotting each topic's representative terms

network=graph_from_incidence_matrix(dtmm)

# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex=0.7, vertex.size=10, vertex.label.family = "Heiti TC Light")

Removing shared terms and plotting again

removed_word = c("高端","疫苗","台灣","已經","有沒有","一定","目前")
phi_m <- topics_words %>%
  filter(!term  %in% removed_word) %>% 
  arrange(desc(phi)) %>% 
  top_n(70)
## Selecting by phi
dtm <-phi_m %>% cast_dtm(topic, term, phi)

dtmm <- as.matrix(dtm)
dim(dtmm)
## [1]  4 52
#set.seed(1)
network = graph_from_incidence_matrix(dtmm)

# plot
set.seed(3)
plot(network, ylim = c(-1,1), xlim = c(-1,1), asp = 0,
     vertex.label.cex = 0.7, vertex.size = 10, vertex.label.family = "Heiti TC Light") 

The graph corroborates the topic labels above:

  • The first topic mainly concerns the domestic vaccines' development progress and clinical-trial results.
  • The second topic mainly concerns the domestic vaccines versus foreign vaccines.
  • The third topic mainly concerns the interplay between the domestic vaccines and politics.
  • The fourth topic mainly concerns the interplay between the domestic vaccines and stock prices.

7. Social Network Graph

Collecting the discussion participants

# keep three columns: cmtPoster (commenter), artPoster (author), artUrl (article link)
link <- Reviews %>%
      select(cmtPoster, artPoster.x, artUrl)

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
## Warning in graph_from_data_frame(d = link, directed = T): In `d' `NA' elements
## were replaced with string "NA"
# number of unique authors: 1659
length(unique(MetaData$artPoster))
## [1] 1659
# number of unique commenters: 28963
length(unique(Reviews$cmtPoster))
## [1] 28963
# total number of unique participants: 29484
allPoster <- c(MetaData$artPoster, Reviews$cmtPoster)
length(unique(allPoster))
## [1] 29484
# Label every user who appears:
# "poster" if they have ever authored an article, otherwise "replyer"
userList <- data.frame(user = unique(allPoster)) %>%
  mutate(type = ifelse(user%in%MetaData$artPoster, "poster", "replyer"))
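
A one-line check of how the participants split between the two roles:

table(userList$type) # counts of "poster" vs. "replyer"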

Filtering the data

# check roughly how many comments articles get (to guide the filtering below)
MetaData %>%
filter(commentNum < 100) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Articles with more than about 30 comments become scarce, so 30 is a reasonable first cutoff for the comment count; the quantile check below shows this another way.
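
A minimal sketch to sanity-check the cutoff against the upper quantiles of commentNum:

# where does a 30-comment cutoff sit in the distribution?
quantile(MetaData$commentNum, probs = c(0.5, 0.75, 0.9, 0.95))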

tang_poster = table(MetaData$artPoster) %>% sort %>% as.data.frame 
colnames(tang_poster) = c("artPoster.x","freq")
tang_poster = tang_poster %>% filter(freq >= 5) # authors with at least 5 articles

link <- Reviews %>%
      filter(commentNum >= 30) %>% # articles with at least 30 comments
      filter(artPoster.x %in% tang_poster$artPoster.x) %>% # use %in%, not ==, to match against the whole author vector
      select(cmtPoster, artPoster.x, artUrl) %>% 
      unique()
# keep only the users who appear in link
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster.x) %>%
          arrange(desc(type))

Building the network graph

set.seed(487)
# drop the edges' direction first, to remove unnecessary information from the plot
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=.2, vertex.label=NA)

Enriching the plot

set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
# color nodes by user role: red for users who have posted articles, light blue for reply-only users
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "red", "lightblue")

# label user accounts with 20 or more connections
plot(reviewNetwork, vertex.size = 3, edge.arrow.size=.2,
     vertex.label=ifelse(degree(reviewNetwork) >= 20, V(reviewNetwork)$label, NA),  vertex.label.font = 2)

The graph shows that articles posted by the four accounts Emacs, CavendishJr, f1317913, and zzahoward attract comparatively many replies; the sketch below reads the same accounts off the network by degree.
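
A minimal sketch that ranks poster nodes by degree, using the reviewNetwork built above:

deg <- degree(reviewNetwork)
# top 10 posters by number of connections
head(sort(deg[V(reviewNetwork)$type == "poster"], decreasing = TRUE), 10)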

Opinion leader analysis

A closer look at the four opinion leaders: Emacs, CavendishJr, f1317913, and zzahoward

leader_data <- MetaData %>% 
  filter((artPoster == "Emacs")|(artPoster == "CavendishJr")|(artPoster == "f1317913")|(artPoster == "zzahoward")) 

leader_data$artDate = as.Date(leader_data$artDate)
leader_data = leader_data %>% mutate(months = as.Date(cut(artDate, "days")))
leader_data_month = leader_data %>% group_by(months,artPoster) %>%
  summarise(num=n()) %>% as.data.frame
## `summarise()` has grouped output by 'months'. You can override using the `.groups` argument.
# combined posting-trend plot
leader_data_month %>% ggplot(aes(x= months,y=num,fill=artPoster))  +geom_bar(stat = "identity")+
  facet_wrap(~artPoster, ncol = 2, scales = "fixed") 

From the plot we can infer:

  • CavendishJr has followed the domestic-vaccine topic fairly consistently.
  • Emacs and zzahoward only started paying closer attention after the domestic outbreak flared up again (after 5/15).
  • f1317913 also became more active after 5/15, but has posted less again recently.

Author analysis: CavendishJr

Word cloud

CavendishJr_data <- leader_data %>% 
  filter(artPoster == "CavendishJr")
CavendishJr_sentence <- CavendishJr_data %>% 
  select(artUrl,sentence)
  
CavendishJr_sentence <-strsplit(CavendishJr_sentence$sentence,"[。!;?!?;]")

# pair each sentence with its article URL and collect them into a data frame
CavendishJr_sentence  <- data.frame(
  artUrl = rep(CavendishJr_data$artUrl, sapply(CavendishJr_sentence, length)), 
  sentence = unlist(CavendishJr_sentence)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
CavendishJr_sentence$sentence <- as.character(CavendishJr_sentence$sentence)

# set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# tokenize, then count each word's occurrences in each article
CavendishJr_word <- CavendishJr_sentence %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)

# draw the word cloud
CavendishJr_word %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 2)  %>%
  arrange(desc(sum)) %>%
  wordcloud2()

Word correlations

# compute the correlation between each pair of words
CavendishJr_word_cors <- CavendishJr_word %>%
  group_by(word) %>%
  filter(n() >= 8) %>%
  pairwise_cor(word, artUrl, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
CavendishJr_word_cors %>%
  filter(correlation > 0.3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family='STXihei') +
  theme_void() 

Matching this with the per-author posting counts above, CavendishJr posted most before the epidemic flared up in May 2021 and again from early June 2021. Combining the word cloud and the word-correlation graph, this author mainly follows the domestic vaccines' development progress and their current clinical-trial results.
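
The sentence-splitting, tokenizing, and counting pipeline above is repeated verbatim for each of the three remaining authors. A hypothetical helper that factors it out (author_word_counts is not part of the original analysis; it reuses the ptt_tokenizer defined earlier):

author_word_counts <- function(leader_data, poster) {
  d <- leader_data %>% filter(artPoster == poster)
  # split each article into sentences and pair every sentence with its artUrl
  sentence_list <- strsplit(d$sentence, "[。!;?!?;]")
  data.frame(artUrl = rep(d$artUrl, sapply(sentence_list, length)),
             sentence = unlist(sentence_list),
             stringsAsFactors = FALSE) %>%
    filter(!str_detect(sentence, regex("^(\t|\n| )*$"))) %>%
    unnest_tokens(word, sentence, token = ptt_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
    count(artUrl, word, sort = TRUE)
}
# e.g., Emacs_word <- author_word_counts(leader_data, "Emacs")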

Author analysis: Emacs

Word cloud

Emacs_data <- leader_data %>% 
  filter(artPoster == "Emacs")
Emacs_sentence <- Emacs_data %>% 
  select(artUrl,sentence)
  
Emacs_sentence <-strsplit(Emacs_sentence$sentence,"[。!;?!?;]")

# pair each sentence with its article URL and collect them into a data frame
Emacs_sentence  <- data.frame(
  artUrl = rep(Emacs_data$artUrl, sapply(Emacs_sentence, length)), 
  sentence = unlist(Emacs_sentence)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
Emacs_sentence$sentence <- as.character(Emacs_sentence$sentence)

# set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# tokenize, then count each word's occurrences in each article
Emacs_word <- Emacs_sentence %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)

# draw the word cloud
Emacs_word %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 1)  %>%
  arrange(desc(sum)) %>%
  wordcloud2()

Word correlations

# compute the correlation between each pair of words
Emacs_word_cors <- Emacs_word %>%
  group_by(word) %>%
  filter(n() >= 3) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

Emacs_word_cors %>%
  filter(correlation > 0.3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family='STXihei') +
  theme_void() 

Matching this with the posting trends above, Emacs has posted most from the May 2021 outbreak up to the present. The word cloud and correlation graph show that besides epidemic-related topics (e.g., domestic vaccines, vaccines, masks), this author also pays close attention to the interplay between the domestic vaccines and the stock market.

Author analysis: f1317913

Word cloud

f1317913_data <- leader_data %>% 
  filter(artPoster == "f1317913")
f1317913_sentence <- f1317913_data %>% 
  select(artUrl,sentence)
  
f1317913_sentence <-strsplit(f1317913_sentence$sentence,"[。!;?!?;]")

# pair each sentence with its article URL and collect them into a data frame
f1317913_sentence  <- data.frame(
  artUrl = rep(f1317913_data$artUrl, sapply(f1317913_sentence, length)), 
  sentence = unlist(f1317913_sentence)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
f1317913_sentence$sentence <- as.character(f1317913_sentence$sentence)

# set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# tokenize, then count each word's occurrences in each article
f1317913_word <- f1317913_sentence %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)

# draw the word cloud
f1317913_word %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 1)  %>%
  arrange(desc(sum)) %>%
  wordcloud2()

Word correlations

# compute the correlation between each pair of words
f1317913_word_cors <- f1317913_word %>%
  group_by(word) %>%
  filter(n() >= 3) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

f1317913_word_cors %>%
  filter(correlation > 0.3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family='STXihei') +
  theme_void() 

Matching this with the posting trends above, f1317913 posted most from mid-May to early June 2021. The word cloud and correlation graph show a focus on the biotech side, i.e., the domestic vaccines' development and clinical trials, with little attention to the stock topic that has recently drawn more discussion.

Author analysis: zzahoward

Word cloud

zzahoward_data <- leader_data %>% 
  filter(artPoster == "zzahoward")
zzahoward_sentence <- zzahoward_data %>% 
  select(artUrl,sentence)
  
zzahoward_sentence <-strsplit(zzahoward_sentence$sentence,"[。!;?!?;]")

# pair each sentence with its article URL and collect them into a data frame
zzahoward_sentence  <- data.frame(
  artUrl = rep(zzahoward_data$artUrl, sapply(zzahoward_sentence, length)), 
  sentence = unlist(zzahoward_sentence)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
zzahoward_sentence$sentence <- as.character(zzahoward_sentence$sentence)

# set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="../dict/user_dict.txt", stop_word = "dict/stop_words.txt")
ptt_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# tokenize, then count each word's occurrences in each article
zzahoward_word <- zzahoward_sentence %>%
  unnest_tokens(word, sentence, token=ptt_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)

# draw the word cloud
zzahoward_word %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 1)  %>%
  arrange(desc(sum)) %>%
  wordcloud2()

Word correlations

# compute the correlation between each pair of words
zzahoward_word_cors <- zzahoward_word %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

zzahoward_word_cors %>%
  filter(correlation > 0.3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family='STXihei') +
  theme_void() 

Matching this with the posting trends above, zzahoward has posted most from the May 2021 outbreak up to the present. The word cloud and correlation graph show a focus on the domestic vaccines' clinical trials and technology, along with discussion of foreign vaccines.