系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] ""

安裝需要的packages

packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr","plotly")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

讀進library

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud2)
library(plotly)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

資料基本介紹

  • 資料來源: 文字平台收集PTT八卦版文章、回覆
  • 資料集: japan_vac.csv、japan_vac_rev.csv
  • 關鍵字:日本、疫苗
  • 資料時間:2021-05-19~2021-06-04

1. 資料前處理

在本篇分析中,我們希望建構特定議題的社群網路圖,並分析網路中討論的議題主題

我們需要兩種資料: (1) 每篇文章的主題分類(LDA) (2) 社群網路圖的link和nodes

載入文章和網友回覆資料

posts <- read_csv("./data/japan_vac.csv") %>% # 文章 8249 
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence=gsub("\n", "", sentence)) %>%  #換行符號 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence)) %>%  #有url的取代掉
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|最後更新", "", sentence))
  
reviews <- read_csv("./data/japan_vac_rev.csv") # 回覆 603476 

 head(posts)
## # A tibble: 6 x 10
##   artTitle   artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##   <chr>      <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
## 1 [問卦]美~  2021-05-19 16:07:44 https:~ faycos    Gossi~         47    22     4
## 2 [問卦]是~  2021-05-19 16:13:44 https:~ kevin195~ Gossi~         54    17    11
## 3 [問卦]以~  2021-05-19 16:13:52 https:~ fraterni~ Gossi~         38    13     9
## 4 [問卦]周~  2021-05-19 16:14:11 https:~ WARgame7~ Gossi~         26    14     3
## 5 [問卦]是~  2021-05-19 16:39:34 https:~ didi0909  Gossi~          4     2     0
## 6 [問卦]所~  2021-05-19 16:44:55 https:~ Skynet55~ Gossi~        910   305    34
## # ... with 1 more variable: sentence <chr>
 head(reviews)
## # A tibble: 6 x 10
##   artTitle   artDate    artTime  artUrl     artPoster artCat cmtPoster cmtStatus
##   <chr>      <date>     <time>   <chr>      <chr>     <chr>  <chr>     <chr>    
## 1 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ qazwsx01~ →        
## 2 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ blueskyq~ →        
## 3 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ waijr     推       
## 4 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ yangwn12~ 推       
## 5 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ god2      噓       
## 6 [問卦]美~  2021-05-19 16:07:44 https://w~ faycos    Gossi~ sali921   推       
## # ... with 2 more variables: cmtDate <dttm>, cmtContent <chr>

2.LDA 主題分類

文章斷詞

jieba_tokenizer = worker(user="./dict/jpvac_dict.txt", stop_word = "./dict/stop_words.txt")

news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

斷詞結果可以先存起來,就不用再重跑一次

 tokens <- posts %>%
   mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
    mutate(sentence = gsub("[0-9b-yB-Y]", "",sentence)) %>%
   unnest_tokens(word, sentence, token=news_tokenizer) %>%
   count(artUrl, word) %>%
   rename(count=n)
 tokens
## # A tibble: 421,415 x 3
##    artUrl                                                   word   count
##    <chr>                                                    <chr>  <int>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html az         1
##  2 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 一直       1
##  3 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 一種       1
##  4 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 下單       1
##  5 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 口罩       1
##  6 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 中國       2
##  7 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 中國隊     1
##  8 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 以上       1
##  9 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 加入       1
## 10 https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html 台灣       1
## # ... with 421,405 more rows
 save.image(file = "./data/jpvac_token_result.rdata")
load("./data/jpvac_token_result.rdata")

清理斷詞結果

。根據詞頻,選擇只出現3字以上的字
。整理成url,word,n的格式之後,就可以轉dtm

P.S. groupby by之後原本的字詞結構會不見,把詞頻另存在一個reserved_word裡面

# 依據字頻挑字
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% #詞頻>3的詞彙
  unlist()

jpvac_removed <- tokens %>% 
  filter(word %in% reserved_word) #tokens保留>3的word 

#reef_dtm 裡面 nrow:幾篇文章 ; ncol:幾個字, 轉成dtm 
jpvac_dtm <- jpvac_removed %>% cast_dtm(artUrl, word, count) 

(2) LDA 主題分析

將剛處理好的dtm放入LDA函式分析

# LDA分成5個主題
# jpvac_lda <- LDA(jpvac_dtm, k = 5, control = list(seed = 123)) 
jpvac_lda <- LDA(jpvac_dtm, k =5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")

p.s. 。tidy(jpvac_lda, matrix = “beta”) # 取字 topic term beta值 。tidy(jpvac_lda, matrix=“gamma”) # 取主題 document topic gamma

取出代表字詞(term)

removed_word = c("疫苗","裡面","不是","每天","一直","aa") 

jpvac_term_topic<- tidy(jpvac_lda, matrix = "beta") 
# 看各群的常用詞彙
tidy(jpvac_lda, matrix = "beta") %>% # 取出topic term beta值
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(15, beta) %>% # beta值前15的字
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

可以歸納出

phi_m <- jpvac_term_topic %>% arrange(desc(beta)) %>% top_n(80) #取前100 beta最大值 
## Selecting by beta
dtm <-phi_m %>% cast_dtm(topic, term, beta)

dtmm<-as.matrix(dtm)
dim(dtmm) 
## [1]  5 71
network=graph_from_incidence_matrix(dtmm)

# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex=0.7, vertex.size=10, vertex.label.family = "Heiti TC Light")
## Warning in text.default(x, y, labels = labels, col = label.color, family =
## label.family, : font family not found in Windows font database

各topic的主要字詞,其中az、政府與2個topic均有關連

取出代表主題(topic)

每篇文章拿gamma值最大的topic當該文章的topic

# 在tidy function中使用參數"gamma"來取得 theta矩陣 
jpvac_doc_topic<- tidy(jpvac_lda, matrix = "gamma")  
jpvac_topics <- tidy(jpvac_lda, matrix="gamma") %>% # document topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma) #gamma值最大者 

# jpvac_topics %>% filter(gamma>0.5) %>% arrange(desc(gamma))   #551筆 

table(jpvac_topics$topic ) #各主題的分佈情形 
## 
##    1    2    3    4    5 
## 1372 3172 1396 1264 1749

資料內容探索

posts_topic <- merge(x = posts, y = jpvac_topics, by.x = "artUrl", by.y="document") #增加post的Url 

# 看一下各主題在說甚麼
set.seed(123)
 t1<-
 posts_topic %>% # 主題一
   filter(topic==1) %>%
   select(artTitle,sentence,artDate) %>%
   unique()
 t2<-
 posts_topic %>% # 主題一
   filter(topic==2) %>%
   select(artTitle,sentence,artDate) %>%
   unique()
 t3<-
 posts_topic %>% # 主題一
   filter(topic==3) %>%
   select(artTitle,sentence,artDate) %>%
   unique()
 t4<-
 posts_topic %>% # 主題一
   filter(topic==4) %>%
   select(artTitle,sentence,artDate) %>%
   unique()
 t5<-
 posts_topic %>% # 主題一
   filter(topic==5) %>%
   select(artTitle,sentence,artDate) %>%
   unique()


# posts_topic %>% # 主題一
#   filter(topic==1) %>%
#   select(artTitle,sentence,artDate) %>%
#   unique() %>%
#   sample_n(20)
 
# posts_topic %>% # 主題二
#   filter(topic==2) %>%
#   select(artTitle,sentence) %>%
#   unique() %>%
#   sample_n(20)
# 
# posts_topic %>% # 主題三
#   filter(topic==3) %>%
#    select(artTitle,sentence) %>%
#   unique() %>%
#   sample_n(20)
 
# posts_topic %>% # 主題四
#   filter(topic==4) %>%
#   select(artTitle,sentence) %>%
#   unique() %>%
#   sample_n(20)
 
# posts_topic %>% # 主題五
#   filter(topic==5) %>%
#    select(artTitle,sentence) %>%
#   unique() %>%
#   sample_n(20)

為各主題命名:

主題一:日本送疫苗之後,接種相關議題,如「AZ疫苗打了真的會死嗎 」、「疫苗接種要自費,有辦法達到群體免疫嗎」
主題二:疫苗的八卦議題,如「疫苗跟顯卡哪個比較難搶?」、「所以到底誰在擋我們買疫苗?」、「為什麼我們買不到疫苗有些人很爽啊?」
主題三:日本贈台疫苗相關討論 主題多為討論日本為何要送我們疫苗,背後的出發點為何, 「日本疫苗來台最氣的是誰?」、「疫苗哪國代理有差嗎?」、「日本是不是很陰險」
主題四:政府購買疫苗議題,如「「指揮中心不願買3000萬劑BNT疫苗」是真的」、「蔡:疫苗購買須由中央統籌」、「世界有企業或善心團體買到疫苗的嗎?」
主題五:國產疫苗議題討論,如「沒做三期的疫苗打了最慘會出現什麼狀況?」、「如果明天宣布打國產疫苗,你敢打嗎?」、「高端疫苗重訊!!!」

日期主題分布

畫出每月topic的分布,在5月底爆量討論 在各議題的討論分布上,主題五的比例逐漸下降;主題三有逐漸上升的趨勢。
主題二:疫苗的八卦議題
主題三:日本贈台疫苗相關討論,因日本疫苗真的抵台,導致議題大量被討論
主題五:國產疫苗議題討論

#看發文數分布
posts %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="gray")+
  ggtitle("發文數分布") + xlab("artDate") +
  scale_x_date(date_breaks="2 days", date_labels="%m-%d")+
    geom_point(aes(colour = count)) -> p
ggplotly(p)
#看每月的發文數  
posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  count(artDate= format(artDate,'%m%d'),topic) %>%
  ggplot(aes(x= artDate,y=n,fill=as.factor(topic))) +
  geom_col() + 
  theme_void()

#依每月topic的分佈,以比例呈現
posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate= format(artDate,'%m%d'),topic) %>%
  summarise(sum =sum(topic)) %>%
  ggplot(aes(x= artDate,y=sum,fill=as.factor(topic))) +
  geom_col(position="fill") -> pt
ggplotly(pt)

4. 社群網路圖

資料合併

# 文章和留言
reviews <- reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)

posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl") #用artUrl merge posts 和reviews 

# 把文章和topic
posts_Reviews <- merge(x = posts_Reviews, y = jpvac_topics, by.x = "artUrl", by.y="document")

取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位

link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)
head(link) #欄位的順序有關係 
##    cmtPoster artPoster                                                   artUrl
## 1 qazwsx0128    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html
## 2 blueskyqoo    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html
## 3      waijr    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html
## 4 yangwn1234    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html
## 5       god2    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html
## 6    sali921    faycos https://www.ptt.cc/bbs/Gossiping/M.1621440466.A.6CB.html

基本網路圖

建立網路關係

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH f4bdd01 DN-- 41858 634828 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from f4bdd01 (vertex names):
##  [1] qazwsx0128->faycos blueskyqoo->faycos waijr     ->faycos yangwn1234->faycos
##  [5] god2      ->faycos sali921   ->faycos puritylife->faycos jjlee     ->faycos
##  [9] tomchow76 ->faycos IEhacker  ->faycos orzooozro ->faycos orzooozro ->faycos
## [13] tdjpl     ->faycos gn01657736->faycos jbking    ->faycos badguy666 ->faycos
## [17] neos042   ->faycos soyghcg   ->faycos soyghcg   ->faycos soyghcg   ->faycos
## [21] soyghcg   ->faycos soyghcg   ->faycos HyperPoro ->faycos lolancelot->faycos
## [25] MacOSX10  ->faycos MacOSX10  ->faycos didi0909  ->faycos windyyw   ->faycos
## [29] Makubex82 ->faycos takamiku  ->faycos copyer    ->faycos copyer    ->faycos
## + ... omitted several edges

直接畫的話,因為點沒有經過篩選,看起來會密密麻麻的還需要經過一次資料篩選

# 畫出網路圖(密集恐懼警告)
#plot(reviewNetwork)
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)

資料篩選

資料篩選的方式:

  • 文章:文章日期、留言數(commentNum)
  • link、node:degree
# 看一下留言數大概都多少(方便後面篩選)
# 留言數最高有到1500多個留言 
posts %>%
  filter(commentNum > 100) %>%
  ggplot(aes(x=commentNum)) + geom_histogram() 

依據發文數或回覆數篩選post和review

# 帳號發文篇數 , 發文數都在30篇以下
post_count = posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count
## # A tibble: 3,642 x 2
##    artPoster   count
##    <chr>       <int>
##  1 lpbrother      26
##  2 whiteadam      24
##  3 Makubex82      23
##  4 CavendishJr    22
##  5 douge          22
##  6 jimgene        22
##  7 A6             20
##  8 cloud72426     19
##  9 jerrylin       19
## 10 kaky           19
## # ... with 3,632 more rows
# 帳號回覆總數
review_count = reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count
## # A tibble: 40,948 x 2
##    cmtPoster    count
##    <chr>        <int>
##  1 trywish       1614
##  2 birdy590      1503
##  3 s9234032      1315
##  4 kissa0924307  1173
##  5 amida959       982
##  6 Ghamu          964
##  7 bathilda       962
##  8 BaRanKa        917
##  9 douge          906
## 10 nowitzki0207   851
## # ... with 40,938 more rows
# 發文者
poster_filtered <- post_count %>% filter(count >= 2) #共1497位 發文超過2篇 
posts <- posts %>%  filter(posts$artPoster %in% poster_filtered$artPoster) #找出發文超過2篇的posts

# 回覆者
reviewer_filtered <- review_count %>%  filter(count >= 20) #6688位 
reviews <- reviews %>%  filter(reviews$cmtPoster %in% reviewer_filtered$cmtPoster)
# 檢視參與人數
length(unique(posts_Reviews$artPoster)) # 發文者數量 3634
## [1] 3634
length(unique(posts_Reviews$cmtPoster)) # 回覆者數量 40948
## [1] 40948
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster) # 總參與人數 41858 
length(unique(allPoster))
## [1] 41858

標記所有出現過得使用者

  • poster:只發過文、發過文+留過言
  • replier:只留過言
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%posts$artPoster, "poster", "replier"))
head(userList)
##         user   type
## 1     faycos poster
## 2 kevin19528 poster
## 3 fraternity poster
## 4 WARgame723 poster
## 5   didi0909 poster
## 6 Skynet5566 poster

篩選社群,留言數超過一定數量者

link <- posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>3) %>% #同一篇文章留言超過3次  
      filter(commentNum > 300) %>%
      filter(artCat=="Gossiping") %>% 
      select(cmtPoster, artPoster, artUrl) %>% 
      unique()
link
## # A tibble: 8,858 x 3
## # Groups:   cmtPoster, artUrl [8,858]
##    cmtPoster    artPoster  artUrl                                               
##    <chr>        <chr>      <chr>                                                
##  1 A6           Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  2 kutkin       Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  3 fluffylove   Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  4 bathilda     Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  5 NEWOLD       Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  6 bnb89225     Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  7 EvoLancer    Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  8 jtsu5223     Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
##  9 sluttervagen Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
## 10 s999132      Skynet5566 https://www.ptt.cc/bbs/Gossiping/M.1621442697.A.EB1.~
## # ... with 8,848 more rows

篩選在link裡面有出現的使用者

filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user,10)
##           user    type
## 1         wppw replier
## 2     lovebxcx replier
## 3        purue replier
## 4  stupidghost replier
## 5     griffick replier
## 6      aq53176 replier
## 7     DORAQMON replier
## 8       lingsk replier
## 9      skizard replier
## 10    trombone replier

p.s.想要看會變怎麼樣的人可以跑下面的code

## 警告!有密集恐懼症的人請小心使用
v = userList 
reviewNetwork <- graph_from_data_frame(d=link, v=userList, directed=T)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

因爲圖片箭頭有點礙眼,所以這裏我們先把關係的方向性拿掉,減少圖片中的不必要的資訊 set.seed 因為igraph呈現的方向是隨機的

set.seed(487)
v=filtered_user

reviewNetwork = degree(reviewNetwork) > 25
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

加上nodes的顯示資訊

用使用者的身份來區分點的顏色

  • poster:gold(有發文)
  • replier:lightblue(只有回覆文章)
set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "orange", "lightblue")
plot(reviewNetwork, vertex.size=4, edge.arrow.size=0.3,vertex.label=NA)

可以稍微看出圖中的點(人)之間有一定的關聯,不過目前只有單純圖形我們無法分析其中的內容。
因此以下我們將資料集中的資訊加到我們的圖片中。

為點加上帳號名字,用degree篩選要顯示出的使用者,以免圖形被密密麻麻的文字覆蓋

filter_degree = 100
set.seed(123)

# 設定 node 的 label/ color
labels <- degree(reviewNetwork) # 算出每個點的degree
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "orange", "lightblue")

plot(
  reviewNetwork, 
  vertex.size=4, 
  edge.width=1, 
  vertex.label.dist=1,
  vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

我們可以看到基本的使用者關係,但是我們希望能夠將更進階的資訊視覺化。
例如:使用者經常參與的文章種類,或是使用者在該社群網路中是否受到歡迎。

以主題篩選社群

主題二: 主題三: 主題五: 篩選一篇文章回覆10次以上者,且文章留言數多於200則, 欄位只取:cmtPoster(評論者), artPoster(發文者), artUrl(文章連結), topic(主題)

link <- posts_Reviews %>%
      group_by(cmtPoster, artUrl) %>% 
      filter(n()>10) %>% 
      filter(commentNum > 200) %>%
      filter(topic == "2" | topic == "3" |topic == "5" ) %>%
      select(cmtPoster, artPoster, artUrl, topic) %>% 
      unique()
av=data.frame(link$artPoster)
head(unique(av$link.artPoster))
## [1] "Skynet5566"   "wizardfizban" "wppw"         "aliceric29"   "AutoTea"     
## [6] "oz5566"
head(av$link.artPoster)
## [1] "Skynet5566" "Skynet5566" "Skynet5566" "Skynet5566" "Skynet5566"
## [6] "Skynet5566"
  • 抓nodes 在所有的使用者裡面,篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user)
##           user    type
## 1         wppw replier
## 2      AutoTea replier
## 3  jasonking3c replier
## 4           F5 replier
## 5    suck55426 replier
## 6 LoveMakeLove replier

使用者經常參與的文章種類

filter_degree = 10

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork) 
# labels
V(reviewNetwork)$label <- names(labels) #label的名稱為poster,replier的PTT ID ... 
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "orange", "lightblue") #節點的顏色
# V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的"主題",對他們的關聯線進行上色
E(reviewNetwork)$color <-
  ifelse(E(reviewNetwork)$topic == "1", "palevioletred",
  ifelse(E(reviewNetwork)$topic == "2", "lightgreen", 
  ifelse(E(reviewNetwork)$topic == "3", "lightgray",   
  ifelse(E(reviewNetwork)$topic == "4", "lightblue",  
         "lightyellow"
       ) ) ))

# 畫出社群網路圖(degree>7的才畫出來)
set.seed(5432)
plot(reviewNetwork, vertex.size=4, edge.width=2, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=1)

# 加入標示
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("orange","lightblue"), pt.cex=1, cex=1)
# legend("topleft", c("TOPIC_1","TOPIC_2","TOPIC_3","TOPIC_4","TOPIC_5"),
#        col=c("palevioletred", "lightgreen","lightgray","lightblue",  "lightyellow"), lty=1, cex=1)

legend("topleft", c("疫苗的八卦議題","日本贈台疫苗相關討論","國產疫苗議題討論"),
       col=c("lightgreen", "lightgray","lightblue"), lty=1, cex=1)

> voshi412, akuan413, mystage主要為發文連署議題
Hyuui 為護礁與能源議題
shinmoner, sunchen0201 第三接收站開發或替代方案

使用者是否受到歡迎

八卦版 PTT的回覆有三種,推文、噓文、箭頭,我們只要看推噓就好,因此把箭頭清掉,這樣資料筆數較少,所以我們把篩選的條件放寬一些。

filter_degree = 40 # 使用者degree

# 過濾留言者對發文者的推噓程度
link <- posts_Reviews %>%
      filter(artCat=="Gossiping") %>% 
      filter(commentNum > 150) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 2) %>%
      ungroup() %>% 
      select(artTitle,sentence,cmtContent, cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

  # 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "orange", "lightblue")


# 依據回覆發生的文章所對應的推噓狀態,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# 畫出社群網路圖
set.seed(180)
plot(reviewNetwork, vertex.size=5, edge.width=1, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=1)

# 加入標示
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("orange","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

#2個中心帳號的PO文內容 
link %>% filter(artPoster %in% c("jasperbf","yoshiki78529")) %>% 
  inner_join(posts) %>%
  select (cmtPoster,artPoster,artTitle,cmtStatus,artUrl) 
## Joining, by = c("artPoster", "artUrl")
## # A tibble: 403 x 5
##    cmtPoster   artPoster   artTitle            cmtStatus artUrl                 
##    <chr>       <chr>       <chr>               <chr>     <chr>                  
##  1 wirewool    yoshiki785~ [新聞]衛福部:擬購~ 推        https://www.ptt.cc/bbs~
##  2 wirewool    yoshiki785~ [新聞]衛福部:擬購~ 噓        https://www.ptt.cc/bbs~
##  3 unexpect    yoshiki785~ [爆卦]集合!LIVE本~  推        https://www.ptt.cc/bbs~
##  4 KrisNYC     yoshiki785~ [爆卦]集合!LIVE本~  噓        https://www.ptt.cc/bbs~
##  5 ssaic2006   yoshiki785~ [爆卦]集合!LIVE本~  推        https://www.ptt.cc/bbs~
##  6 louiswei19~ yoshiki785~ [爆卦]集合!LIVE本~  噓        https://www.ptt.cc/bbs~
##  7 AxelGod     yoshiki785~ [爆卦]集合!LIVE本~  推        https://www.ptt.cc/bbs~
##  8 tony4417    yoshiki785~ [爆卦]集合!LIVE本~  推        https://www.ptt.cc/bbs~
##  9 chihlee5566 yoshiki785~ [爆卦]集合!LIVE本~  噓        https://www.ptt.cc/bbs~
## 10 colin8930   yoshiki785~ [爆卦]集合!LIVE本~  噓        https://www.ptt.cc/bbs~
## # ... with 393 more rows

八卦版中的討論中以推文較多、噓文較少

networkD3

需要設定每個節點的id,記得要從0開始

library(networkD3)
## Warning: package 'networkD3' was built under R version 4.0.5
links = link
nodes = filtered_user
nodes$id = 0:(length(nodes$user) - 1) #node id從0開始 

# 整理資料格式
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)] #link裡的artPoster,cmtPoster有match到filtered_user的userID, 的type 取出放入group欄位 
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1 
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# 畫圖
library(networkD3)
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")  # 設定推噓顏色
             )
## Links is a tbl_df. Converting to a plain data frame.

總結