0. Objectives:

  • Data source: articles and comments from the PTT Gossiping board, collected via a text-collection platform

  • Datasets: Corrected_articleMetaData.csv, Corrected_articleReviews.csv

  • Keywords: 校正回歸, 校正迴歸, 疫情, 確診; full query list: “疫情,台灣疫情,新冠肺炎,新冠,新冠病毒,病毒傳播,病毒,肺炎,武漢肺炎,COVID,COVID19,COVID-19,染疫,罹患, 傳染,傳播,散佈,散布,確診,確診個案,確診人數,人數,數量,確診數量,三階,三級,四級,四階,群聚,群聚感染, 感染,疑似,疑似確診,人,多少人,多少個案,多少,數字,預測,預估,猜測,推論,推斷,夢見,封城,人數增加,人數減少”

  • Data period: 2021-06-02 ~ 2021-06-06

  • Research questions: >1. What were the focal points of the discussion on Japan's vaccine donation and the U.S. senators' visit announcing the U.S. donation?
    >2. What trends does the discussion volume show around specific points in time?
    >3. How is the social network discussing the vaccine donations distributed?
    >4. Who are the opinion leaders in the US/Japan donation discussion, and how do netizens' 推/噓 (push/boo) reactions break down?

1. Preliminaries

Setup

if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 4.0.5
Sys.setlocale(category = "LC_ALL", locale = "Chinese (Traditional)_Taiwan.950") 
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
# avoid garbled Chinese characters (on Unix-like systems use zh_TW.UTF-8)
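The Windows-specific locale name above is unavailable on macOS or Linux; a rough equivalent there (an assumption: a Traditional Chinese UTF-8 locale such as zh_TW.UTF-8 is installed, check with locale -a) is:

# hypothetical non-Windows equivalent; run only on macOS/Linux
# Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")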

Load packages

pacman::p_load("MASS","dplyr", "ggplot2", "tidyr", "scales","caret","tinytex", "tidytext", "stringr", "wordcloud2",'readr','data.table','reshape2',"jiebaR","corrplot","Hmisc","fmsb","sentimentr","text2vec","RTextTools","SnowballC","lubridate","purrr","lime","ldatuning","igraph", "topicmodels","rtweet","tmcn","textmineR","tm","Matrix","gmp")

2. Loading the data

posts <- fread("C:/Users/Davis Liu/Documents/R/SOCIAL MEDIA ANALYSIS/endterm/0604_articleMetaData.csv", encoding = 'UTF-8') # articles: 2,655
reviews <- fread("C:/Users/Davis Liu/Documents/R/SOCIAL MEDIA ANALYSIS/endterm/0604_articleReviews.csv", encoding = 'UTF-8') # comments: 207,237

head(posts)
head(reviews)

3. Data preprocessing

3.1 Sentence segmentation

# sentence segmentation: replace "\n\n" with the full-width period "。"
mask_meta <- posts %>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))

# split on full- or half-width exclamation marks, question marks, and semicolons, and the full-width period
mask_sentences <- strsplit(mask_meta$sentence,"[。!;?!?;]")

# pair each sentence with its source article and collect the result into a data frame
mask_sentences <- data.frame(
                        artUrl = rep(mask_meta$artUrl, sapply(mask_sentences, length)),
                        sentence = unlist(mask_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
                      # drop sentences consisting only of whitespace (\t, \n, spaces)

mask_sentences$sentence <- as.character(mask_sentences$sentence)
mask_sentences
# initialise a tokenization engine with default parameters plus a stop-word list
jieba_tokenizer = worker(stop_word = "C:\\Users\\Davis Liu\\Documents\\R\\win-library\\4.0\\jiebaRD\\dict\\stop_words.utf8")

# re-segment with a custom user dictionary (optional, left commented out)
#new_user_word(jieba_tokenizer, c(mask_lexicon))

# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # stop words are already removed via the worker's stop_word dictionary
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# split the sentences with the tokenizer initialised above
tokens <- mask_sentences %>%
    mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    #mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
    unnest_tokens(word, sentence, token=chi_tokenizer) 
head(tokens)

3.2 The tokenization results can be saved so they need not be recomputed

save.image(file = "C:/Users/Davis Liu/Documents/R/SOCIAL MEDIA ANALYSIS/endterm/token_result.rdata")
## Warning in save(list = names(.GlobalEnv), file = outfile, version = version, :
## 載入時 'package:tmcn' 也許不能用
load("C:/Users/Davis Liu/Documents/R/SOCIAL MEDIA ANALYSIS/endterm/token_result.rdata")

3.3 Cleaning the tokenization results

1. By word frequency, keep only words that appear more than 3 times in the corpus.
2. Arrange the result into (artUrl, word, n) format, which can then be cast into a DTM.

freq = 3
# keep words by corpus frequency
reserved_word <- tokens %>% 
  group_by(artUrl, word)%>%
  mutate(count=n())%>%# per-article word counts
  ungroup()%>%
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word)



mask_removed <-tokens %>% 
  group_by(artUrl, word)%>%
  mutate(count=n())%>%# per-article word counts
  ungroup()%>%
  filter(word %in% reserved_word)

3.4 Building the DTM

# mask_dtm: nrow = number of articles, ncol = number of terms
mask_dtm <- mask_removed %>% 
  cast_dtm(artUrl, word, count) 
# this dense object cannot be used for the tuning plots below

# cast into a Matrix object
dtm_spare <- mask_removed %>%
  cast_sparse(artUrl, word, count) # the sparse matrix version is needed for the plots below
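As a quick sanity check, the dimensions of both objects should match the number of articles and retained terms:

dim(mask_dtm)  # articles x terms
dim(dtm_spare) # same shape, as a sparse Matrix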

4. LDA topic analysis

4.1 Choosing the best number of topics

Coherence & prevalence scores

k_list <- seq(1, 11, by = 1)

vocabulary <- colnames(dtm_spare)
model_dir <- paste0("models_", digest::digest(vocabulary, algo = "sha1"))
if (!dir.exists(model_dir)) dir.create(model_dir)


model_list <- TmParallelApply(X = k_list, FUN = function(k){
  filename = file.path(model_dir, paste0(k, "_topics.rda"))
    if (!file.exists(filename)) {
    m <- FitLdaModel(dtm = dtm_spare, k = k, iterations = 100)
    m$k <- k
    m$coherence <- CalcProbCoherence(phi = m$phi, dtm = dtm_spare, M = 5)
    save(m, file = filename)} else {load(filename)}
    m
    }
, export=c("dtm_spare", "model_dir")) # export only needed for Windows machines
#model tuning
#choosing the best model

#m=lda_topic_model
coherence_mat <- data.frame(k = sapply(model_list, function(x) nrow(x$phi)), 
                            coherence = sapply(model_list, function(x) mean(x$coherence)),
                            prevalence = sapply(model_list, function(x) mean(colSums(x$theta)/sum(x$theta)*100)),
                            stringsAsFactors = FALSE)
coherence_mat

Coherence score (cohesion within each topic; higher is better)
Prevalence score (overlap across topics; lower is better)
These scores point to a seven-topic model.
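The same choice can be read off coherence_mat programmatically; a minimal sketch using the columns computed above:

# candidate k with the highest mean coherence
best_k <- coherence_mat$k[which.max(coherence_mat$coherence)]
best_k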

4.1.1 Coherence & prevalence

coherence_mat %>%
  mutate(max_k = which(coherence == max(coherence))) %>%
  pivot_longer(cols = c(coherence,prevalence)) %>%
  ggplot(aes(x = k, y = value, group = 1)) +
  geom_point() + geom_line() +
  geom_vline(aes(xintercept = max_k), alpha = 0.5, lty = "dashed") +
  facet_wrap(~name,scales = "free_y",nrow = 2) +
  scale_x_continuous(breaks = seq(1,20,1))+
  theme_minimal() +
  labs(title = "Topics by coherence and prevalence score",
       subtitle = "dashed line marks the k with maximum coherence",
       x = "Number of Topics", y = "Value")

Separately, FindTopicsNumber estimates the best number of topics with four metrics: Griffiths2004, CaoJuan2009, and Arun2010 point to 10 or 11 topics, while Deveaud2014 suggests five, so we try both 5 and 10 topics.

result <- FindTopicsNumber(
  dtm_spare,
  topics = seq(from = 2, to = 11, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 1),
  mc.cores = 3L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
##   Griffiths2004... done.
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.

4.1.2 Minimization

FindTopicsNumber_plot(result)

4.1.3 Cluster Dendrogram

Let's first see what happens with 10 topics.
The dendrogram splits the topics under two main branches before subdividing; below we examine what those two macro-themes are.

#load the rda file
load(file = "C:/Users/Davis Liu/Documents/R/SOCIAL MEDIA ANALYSIS/endterm/10_topics.rda")

m$linguistic <- CalcHellingerDist(m$phi)
m$hclust <- hclust(as.dist(m$linguistic),"ward.D")
# the saved rda does not contain topic labels, so compute them here for the dendrogram
m$labels <- LabelTopics(assignments = m$theta > 0.05, dtm = dtm_spare, M = 1)
m$hclust$labels <- paste(m$hclust$labels, m$labels[,1])
plot(m$hclust)

4.1.4 Silhouette Plot

The silhouette reflects how consistent a clustering is:
a value near 1 means the sample is clustered sensibly; a value near -1 means it should belong to another cluster;
a value near 0 means the sample sits on the boundary between two clusters.
Topic 2 is the least consistent, while topics 6, 7, 8, and 9 are fairly coherent.

library(cluster)
plot(silhouette(cutree(m$hclust,3), as.dist(m$linguistic)))

The log-likelihood curve shows that gains are marginal beyond roughly 80 iterations, so we keep the iteration count low to save computation time.

# log Likelihood (does not consider the prior) 
plot(m$log_likelihood, type = "l")

So let's run both 10 and 5 topics!

4.2 Running the LDA analysis

4.2.1 Splitting into 10 topics

rm(m)
rm(dtm_spare)
rm(tokens)
rm(coherence_mat)
# fit LDA with 10 topics (Gibbs sampling, as in the tuning step above)
mask_lda <- LDA(mask_dtm, k = 10, method = "Gibbs", control = list(seed = 1, iter = 100))

P.S. tidy(mask_lda, matrix = "beta") extracts the topic-term beta values; tidy(mask_lda, matrix = "gamma") extracts the document-topic gamma values.

4.2.2 Extracting the representative terms

removed_word = c("不是","每天","出來","覺得","什麼") # generic filler words to exclude

# most common terms in each topic
tidy(mask_lda, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

From this we can sort the topics into two classes: domestic vaccines and donated foreign vaccines.
topic 1 = "the President thanks the Japanese government for the donated vaccines (donated)"
topic 2 = "poorly focused; mainly vaccine administration (domestic)"
topic 3 = "phase-II trials of the domestic vaccines (domestic)"
topic 4 = "the BNT vaccine donation (donated)"
topic 5 = "mainly the domestic vaccines (domestic)"
topic 6 = "comparing our vaccine production with other countries (domestic)"
topic 7 = "thanking Japan and the US for the donated vaccines (donated)"
topic 8 = "CECC reports on vaccines (donated)"
topic 9 = "the AZ vaccine versus other brands (donated)"
topic 10 = "the US role in vaccines (donated)"

4.2.3 What if we use 5 topics instead?

# fit LDA with 5 topics
lda5 <- LDA(mask_dtm, k = 5, method = "Gibbs", control = list(seed = 1, iter = 100))

4.2.4 Extracting the representative terms

removed_word = c("不是","每天","出來","覺得","什麼") # generic filler words to exclude

# most common terms in each topic
tidy(lda5, matrix = "beta") %>% # topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

> topic 1 = "the President thanks the Japanese government for the donated vaccines"
> topic 2 = "vaccination priority order"
> topic 3 = "discussion of the US vaccine donation"
> topic 4 = "the BNT and AZ vaccines"
> topic 5 = "the domestic vaccines"

4.2.5 Topic conclusion:

The five-topic model is looser, so we stay with ten topics.

4.3 Extracting each article's dominant topic

Each article is assigned the topic with its largest gamma value.

rm(lda5)

# use matrix = "gamma" in the tidy function to obtain the theta matrix
mask_topics <- tidy(mask_lda, matrix="gamma") %>% # document-topic gamma values
                  group_by(document) %>%
                  top_n(1, wt=gamma)
head(mask_topics)
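Before comparing topics, it helps to count how many articles each topic dominates (a sketch using the mask_topics table built above):

# number of articles per dominant topic
mask_topics %>%
  ungroup() %>%
  count(topic, sort = TRUE)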

Below we single out topic 5 and topic 7 for comparison.

4.4 Exploring the data

4.4.1 Which articles belong to each topic

posts_topic <- merge(x = posts, y = mask_topics, by.x = "artUrl", by.y="document")
head(posts_topic)
# sample some titles to see what each topic is about
#set.seed(123)
#posts_topic %>% # topic 1
#  filter(topic==1) %>%
#  dplyr::select(artTitle) %>%
#  unique() %>%
#  sample_n(10)
#
#posts_topic %>% # topic 7
#  filter(topic==7) %>%
#  dplyr::select(artTitle) %>%
#  unique() %>%
#  sample_n(10)

4.4.2 Token counts per topic

Topics 5 and 7 have the most tokens.

topic_token <- merge(x = mask_removed, y = mask_topics, by.x = "artUrl", by.y="document")
topic_token %>% 
  group_by(topic)%>%
  count(word)%>%
  ggplot(aes(x=topic,y=n)) + 
  geom_col() + 
  scale_x_continuous(breaks = seq(1,10,1))+
  #facet_wrap(~ topic) + 
  labs(title = "token-topic proportion", y = "count")

4.4.3 Distribution of article probabilities for each topic

Gamma lies between 0 and 1; the larger the value, the more firmly an article is assigned to its topic.
Topic 5 (domestic vaccines) and topic 7 (thanking the US and Japan) are well defined and concentrated, indicating these two issues drew the most discussion.

ggplot(posts_topic) + 
  geom_histogram(aes(gamma, fill = topic), show.legend = FALSE) + 
  facet_wrap(~ topic) + 
  labs(y = "Number of articles",
       x = expression(gamma),
       title = "Distribution of article probabilities for each topic")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4.4.4 Topic Proportion over Time

Plot the distribution of topics for each day.

posts_topic %>%
  mutate(artDate = as.Date(artDate,format="%Y/%m/%d")) %>% 
  group_by(artDate,topic) %>%
  summarise(n = n()) %>% # number of articles per topic per day
  ggplot(aes(x= artDate,y=n,fill=as.factor(topic))) +
  geom_col(position="fill") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

posts_topic %>%
  mutate(artDate = as.Date(artDate,format="%Y/%m/%d")) %>% 
  group_by(artDate,topic) %>%
  summarise(n = n()) %>% # number of articles per topic per day
  ggplot(aes(x= artDate,y=n,fill=as.factor(topic))) +
  geom_col(position="dodge") 
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

(Topic labels as listed in section 4.2.2.)

Topic 7 (donated foreign vaccines) stayed at its peak from June 3, when the news was confirmed, through June 4, when Japan's AZ doses arrived in Taiwan, and kept a steady volume on June 6 with the US senators' visit.
Topic 5 (domestic vaccines) rose and fell with topic 7's heat.

posts_topic %>%
  group_by(artCat,topic) %>%
  summarise(sum = n())  %>%
  ggplot(aes(x= artCat,y=sum,fill=as.factor(topic))) +
  geom_col(position="dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

4.5 Interim summary:

We focused the discussion on domestic versus foreign vaccines; the topic distribution shows roughly two viewpoints:

  • Topic: domestic vaccines

  • Topic: foreign vaccines

5. Social network graphs

5.1 Merging the data

# merge the articles with their comments
reviews <- reviews %>%
      dplyr::select(artUrl, cmtPoster, cmtStatus, cmtContent)
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")

# attach each article's topic
posts_Reviews <- merge(x = posts_Reviews, y = mask_topics, by.x = "artUrl", by.y="document")

head(posts_Reviews)

Keep three columns: cmtPoster (commenter), artPoster (author), and artUrl (article link).

link <- posts_Reviews %>% 
  dplyr::select(cmtPoster, artPoster, artUrl)

head(link)

5.2 A basic network graph

5.2.1 Building the graph

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 9da3865 DN-- 27883 207337 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 9da3865 (vertex names):
##  [1] flavorBZ    ->cpblgu nowitzki0207->cpblgu rpm2500     ->cpblgu
##  [4] rpm2500     ->cpblgu rpm2500     ->cpblgu isapollo    ->cpblgu
##  [7] rpm2500     ->cpblgu fcuspy      ->cpblgu kingstongyu ->cpblgu
## [10] optw        ->cpblgu rfire       ->cpblgu fcuspy      ->cpblgu
## [13] lululun     ->cpblgu kingstongyu ->cpblgu onceuse     ->cpblgu
## [16] kingstongyu ->cpblgu kingstongyu ->cpblgu onceuse     ->cpblgu
## [19] sss010631   ->cpblgu cccict      ->cpblgu tingover    ->cpblgu
## [22] onceuse     ->cpblgu kingstongyu ->cpblgu tingover    ->cpblgu
## + ... omitted several edges

5.2.2 Plotting it directly

Without any filtering of the nodes, the plot is a dense, unreadable hairball.

# plot the network (warning: extremely dense)
#plot(reviewNetwork)
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

5.3 Plotting after filtering

5.3.1 Filtering criteria:

Articles: by date and comment count (commentNum)
Links & nodes: by degree

# check the comment-count distribution (to guide the filters below)
posts %>%
#  filter(commentNum<100) %>%
  ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.3.2 Filter posts and reviews by posting/commenting counts

# number of articles per account
post_count = posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
# total number of comments per account
review_count = reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count
# posters: keep accounts with at least 2 articles
poster_select <- post_count %>% filter(count >= 2)
posts <- posts %>%  filter(posts$artPoster %in% poster_select$artPoster)

# commenters: keep accounts with at least 20 comments
reviewer_select <- review_count %>%  filter(count >= 20)
reviews <- reviews %>%  filter(reviews$cmtPoster %in% reviewer_select$cmtPoster)
# participation counts
length(unique(posts_Reviews$artPoster)) # number of posters
## [1] 1769
length(unique(posts_Reviews$cmtPoster)) # number of commenters
## [1] 27301
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster) # all participants
length(unique(allPoster))
## [1] 27883

1,769 posters, 27,301 commenters, and 27,883 participants in total.

5.3.3 By user

Label every user who has appeared:

  • poster: has posted (and possibly also commented)
  • replyer: has only commented
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
head(userList,3)

5.4 Drawing the network graph for a single day

5.4.1 Example: June 4

On June 4, Japan's donated AZ vaccine arrived in Taiwan, and the American Institute in Taiwan's statement on US global vaccine allocation also mentioned the donation to Taiwan, so we pick out that day's articles and comments.

link <- posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      filter(artCat=="Gossiping") %>% 
      filter(artDate == as.Date('2021-06-04')) %>%
      dplyr::select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link

Keep only the users who appear in link:

filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user,3)

The arrows clutter the plot, so here we drop the direction of the relationships to remove unnecessary detail from the figure.
We call set.seed because igraph lays out the graph randomly.

set.seed(487)

# build an undirected graph from the filtered links and users
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

5.4.2 Adding display information to the nodes

Colour the nodes by user role:

  • poster: gold (has posted)
  • replyer: lightblue (has only commented)
set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

We can now see some relatedness among the nodes (users), but the bare shape alone does not tell us much about the content.
Below we therefore add information from the dataset to the plot.

5.4.3 Labelling nodes with account names

Use degree to pick which user labels to display, so the plot is not buried under dense text.

filter_degree = 20
set.seed(123)

# set the node labels / colours
labels <- degree(reviewNetwork) # compute each node's degree
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")

plot(
  reviewNetwork, 
  vertex.size=3, 
  edge.width=3, 
  vertex.label.dist=1,
  vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

We can see the basic user relationships, but we would like to visualise richer information:
for example, which kinds of articles a user often participates in, or whether the user is popular in the network.

5.4.4 Histogram of node degree

Check the degree distribution of the graph.
Most connections have degree 1; some grow through quoted reposts, up to about 10 at most.

V(reviewNetwork)$degree <- degree(reviewNetwork)

hist(V(reviewNetwork)$degree,
     col = 'green',
     main = 'Histogram of Node Degree',
     ylab = 'Frequency',
     xlab = 'Degree of Vertices')

5.5 Filtering the network by topic

  • Build the links

Select the articles from 2021-06-04 to 2021-06-06; keep commenters who replied more than 3 times within an article, articles with more than 200 comments,
and articles whose topic falls into the two main classes: 1 (thanking Japan for the donation) and 2 (vaccination).
Keep only the columns cmtPoster (commenter), artPoster (author), artUrl (article link), and topic.

5.5.1 Topic selection

link <- posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% 
      filter(commentNum > 200) %>%
      filter(artCat=="Gossiping") %>% #HatePolitics / Gossiping
      filter(artDate >= as.Date('2021-06-04')) %>%
      filter(topic == 1 | topic == 2) %>% 
      dplyr::select(cmtPoster, artPoster, artUrl, topic) %>% 
      unique()
link
  • Build the nodes > out of all users, keep those who appear in link
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user,3)

5.5.2 Which kinds of articles users often participate in

filter_degree = 13

# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# colour the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")

# colour each edge by the topic of the article the comment appeared in
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "2", "palevioletred", "lightgreen")

# plot the network (label only nodes with degree > filter_degree)
set.seed(5432)
plot(reviewNetwork, vertex.size=3, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# add legends
legend("bottomright", c("poster","commenter"), pch=21, 
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("thanks for Japan's donation","vaccination"), 
       col=c("palevioletred", "lightgreen"), lty=1, cex=1)

5.5.3 Is a user popular?

PTT comments come in three kinds: 推 (push), 噓 (boo), and the neutral arrow (→). We only need the pushes and boos, so we drop the arrows; since this leaves fewer rows, we relax the filter thresholds a little.

filter_degree = 7 # user degree threshold

# keep only the push/boo reactions from commenters toward authors
link <- posts_Reviews %>%
      filter(artCat=="Gossiping") %>% 
      filter(commentNum > 100) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 2) %>%
      ungroup() %>% 
      dplyr::select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# the plotting below repeats the earlier steps, so we do not spell them out again

# keep the users who appear in link
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# colour the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# colour each edge by the comment's 推/噓 status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# plot the network
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# add legends
legend("bottomright", c("poster","commenter"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推 (push)","噓 (boo)"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

The discussion shows more pushes than boos, suggesting netizens took a positive interest in the donated foreign vaccines.
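A quick tally of the edge attribute backs this up (a sketch; the exact counts depend on the filters above):

# count 推 vs 噓 edges in the current graph
table(E(reviewNetwork)$cmtStatus)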

5.5.4 User importance

V(reviewNetwork)$size <- strength(reviewNetwork)
par(mar=c(0,0,0,0))
# plot the network (node size = strength)
set.seed(5432)
plot(reviewNetwork, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# log-scale the strength so node sizes are less skewed

V(reviewNetwork)$size <- log(strength(reviewNetwork))*4+3
par(mar=c(0,0,0,0))
# plot the network again with the rescaled sizes
set.seed(5432)
plot(reviewNetwork, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

The gold nodes (authors whose articles were pushed heavily) and the light-blue nodes (users with the most replies) reveal the community's opinion leaders:
among posters the opinion leader is LionKing; among commenters, Wojnarowski.
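The same leaders can be read off programmatically by sorting node strength (a sketch on the graph built above):

# top-10 accounts by weighted degree (strength)
head(sort(strength(reviewNetwork), decreasing = TRUE), 10)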

5.5.5 Several layouts at once

par(mfrow=c(2, 3), mar=c(0,0,1,0))
plot(reviewNetwork, edge.width=3, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),layout=layout_randomly, main="Random")
plot(reviewNetwork, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),edge.width=3,layout=layout_in_circle, main="Circle")
plot(reviewNetwork, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),edge.width=3,layout=layout_as_star, main="Star")
plot(reviewNetwork, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),edge.width=3,layout=layout_as_tree, main="Tree")
plot(reviewNetwork, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),edge.width=3,layout=layout_on_grid, main="Grid")
plot(reviewNetwork, vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),edge.width=3,layout=layout_with_fr, main="Force-directed")

The dominance of pushes over boos shows up even more clearly in these layouts.

5.6 Appendix: networkD3

Every node needs an id, and the ids must start from 0.
Relationships among the posters:

#install.packages("networkD3")
library(networkD3)
## Warning: package 'networkD3' was built under R version 4.0.5
links = link
nodes = filtered_user

# arrange into networkD3's format (source/target ids must start at 0)
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# draw the interactive graph
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  
             )
## Links is a tbl_df. Converting to a plain data frame.

6. Conclusions

1. What were the focal points of the discussion on Japan's vaccine donation and the US senators' visit announcing the US donation?
The articles collected over 2021-06-04 ~ 2021-06-07 fall roughly into two classes, domestic vaccines and the US/Japan foreign-vaccine donations, plus others such as thank-you posts about the President and other countries' vaccine production, ten topics in all.

2. What trends does the discussion volume show around specific points in time?
On the evening of June 3 the White House announced a vaccine-sharing plan whose list included Taiwan, and the same day Japan confirmed it would donate vaccines to Taiwan, so discussion of the US/Japan donations surged on June 3. It stayed high on the 4th, when Japan's donated AZ doses arrived in Taiwan and the American Institute in Taiwan's statement on US global vaccine allocation again mentioned Taiwan. Topic 7 dips slightly on June 5: it covers the foreign donations in general, while topic 10 focuses on the US role (e.g. "America the father", the military aircraft) in more political terms, splitting off part of topic 7's volume.

3. How is the social network discussing the vaccine donations distributed?
The network shows heavy posting on both the domestic-vaccine and foreign-vaccine sides, with an overlap between the two groups.

4. Who are the opinion leaders in the US/Japan donation discussion, and what are netizens' push/boo reactions?
On the Gossiping board, LionKing is the leading poster in the foreign-vaccine discussion, and netizens mostly pushed. Wojnarowski, with the most replies, can also be regarded as a hub of the community.