從發文數最多的四天來看 2019-08-12 2019-10-04 2019-10-22 2020-05-26 並且挑選出回文次數>150次帳號

安裝packages

packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

載入packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(topicmodels)
library(stringr)
library(ggplot2)

讀取文章資料

posts <- read_csv("./HongKong_articleMetaData.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )

四天的文章

postsmost4 <- posts %>% 
  filter(artDate == as.Date("2019-08-12") |
           artDate == as.Date("2019-10-04") |
           artDate == as.Date("2019-10-22") |
           artDate == as.Date("2020-05-26"))

讀取回覆資料

reviews <- read_csv("./HongKong_articleReviews.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   cmtPoster = col_character(),
##   cmtStatus = col_character(),
##   cmtDate = col_datetime(format = ""),
##   cmtContent = col_character()
## )

四天的回覆

reviewsmost4 <- reviews %>% 
  filter(artDate == as.Date("2019-08-12") |
           artDate == as.Date("2019-10-04") |
           artDate == as.Date("2019-10-22") |
           artDate == as.Date("2020-05-26"))

計算每個帳號回覆的次數

reviewer_count <- reviewsmost4 %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

找出回覆次數>150次的使用者

reviewer_more150 <- reviewer_count %>%
  filter(count >= 150)
reviewsmost4 <- reviewsmost4 %>%
  filter(reviewsmost4$cmtPoster %in% reviewer_more150$cmtPoster)

選取需要的欄位

reviewsmost4 <- reviewsmost4 %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)

所有參與人的人數

allPoster <- c(postsmost4$artPoster, reviewsmost4$cmtPoster)
length(unique(allPoster))
## [1] 727

整理所有參與人

# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%postsmost4$artPoster, "poster", "replyer"))

將原文與回覆Join起來

# 把原文與回覆依據artUrl innerJoin起來
posts_Reviews <- merge(x = postsmost4, y = reviewsmost4, by = "artUrl")

篩選欄位

# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- posts_Reviews %>%
      select(cmtPoster, artPoster, artUrl)

建立網路關係

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 471a399 DN-- 290 2504 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 471a399 (vertex names):
##  [1] WTF55665566->jimgene  WTF55665566->jimgene  WTF55665566->jimgene 
##  [4] WTF55665566->jimgene  Moratti    ->jimgene  WTF55665566->jimgene 
##  [7] WTF55665566->jimgene  soria      ->jimgene  soria      ->jimgene 
## [10] soria      ->jimgene  WTF55665566->jimgene  WTF55665566->jimgene 
## [13] WTF55665566->jimgene  soria      ->jimgene  Moratti    ->jimgene 
## [16] soria      ->jimgene  soria      ->jimgene  soria      ->jimgene 
## [19] soria      ->jimgene  WTF55665566->kimiya1  WTF55665566->kimiya1 
## [22] WTF55665566->kimiya1  Moratti    ->kimiya1  WTF55665566->kimiya1 
## + ... omitted several edges
# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
set.seed(123)

# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 顯示有超過200個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=4,
     vertex.label=ifelse(degree(reviewNetwork) > 200, V(reviewNetwork)$label, NA),  vertex.label.font=2)