從發文數最多的四天來看 2019-08-12 2019-10-04 2019-10-22 2020-05-26 並且挑選出回文次數>150次帳號
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(topicmodels)
library(stringr)
library(ggplot2)
posts <- read_csv("./HongKong_articleMetaData.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
postsmost4 <- posts %>%
filter(artDate == as.Date("2019-08-12") |
artDate == as.Date("2019-10-04") |
artDate == as.Date("2019-10-22") |
artDate == as.Date("2020-05-26"))
reviews <- read_csv("./HongKong_articleReviews.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## cmtPoster = col_character(),
## cmtStatus = col_character(),
## cmtDate = col_datetime(format = ""),
## cmtContent = col_character()
## )
reviewsmost4 <- reviews %>%
filter(artDate == as.Date("2019-08-12") |
artDate == as.Date("2019-10-04") |
artDate == as.Date("2019-10-22") |
artDate == as.Date("2020-05-26"))
reviewer_count <- reviewsmost4 %>%
group_by(cmtPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
reviewer_more150 <- reviewer_count %>%
filter(count >= 150)
reviewsmost4 <- reviewsmost4 %>%
filter(reviewsmost4$cmtPoster %in% reviewer_more150$cmtPoster)
reviewsmost4 <- reviewsmost4 %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
allPoster <- c(postsmost4$artPoster, reviewsmost4$cmtPoster)
length(unique(allPoster))
## [1] 727
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%postsmost4$artPoster, "poster", "replyer"))
# 把原文與回覆依據artUrl innerJoin起來
posts_Reviews <- merge(x = postsmost4, y = reviewsmost4, by = "artUrl")
# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- posts_Reviews %>%
select(cmtPoster, artPoster, artUrl)
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 471a399 DN-- 290 2504 --
## + attr: name (v/c), artUrl (e/c)
## + edges from 471a399 (vertex names):
## [1] WTF55665566->jimgene WTF55665566->jimgene WTF55665566->jimgene
## [4] WTF55665566->jimgene Moratti ->jimgene WTF55665566->jimgene
## [7] WTF55665566->jimgene soria ->jimgene soria ->jimgene
## [10] soria ->jimgene WTF55665566->jimgene WTF55665566->jimgene
## [13] WTF55665566->jimgene soria ->jimgene Moratti ->jimgene
## [16] soria ->jimgene soria ->jimgene soria ->jimgene
## [19] soria ->jimgene WTF55665566->kimiya1 WTF55665566->kimiya1
## [22] WTF55665566->kimiya1 Moratti ->kimiya1 WTF55665566->kimiya1
## + ... omitted several edges
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
set.seed(123)
# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 顯示有超過200個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=4,
vertex.label=ifelse(degree(reviewNetwork) > 200, V(reviewNetwork)$label, NA), vertex.label.font=2)