發文次數>30次的使用者 回覆次數>600次的使用者 的網路關係圖
packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(topicmodels)
library(stringr)
library(ggplot2)
require(wordcloud2)
## Loading required package: wordcloud2
library(wordcloud)
## Loading required package: RColorBrewer
posts <- read_csv("./HongKong_articleMetaData.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
poster_count <- posts %>%
group_by(artPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
poster_more30 <- poster_count %>%
filter(count >= 30)
posts <- posts %>%
filter(posts$artPoster %in% poster_more30 $artPoster)
reviews <- read_csv("./HongKong_articleReviews.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## cmtPoster = col_character(),
## cmtStatus = col_character(),
## cmtDate = col_datetime(format = ""),
## cmtContent = col_character()
## )
reviewer_count <- reviews %>%
group_by(cmtPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
reviewer_more600 <- reviewer_count %>%
filter(count >= 600)
reviews <- reviews %>%
filter(reviews$cmtPoster %in% reviewer_more600$cmtPoster)
reviews <- reviews %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
allPoster <- c(posts$artPoster, reviews$cmtPoster)
length(unique(allPoster))
## [1] 69
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
# 把原文與回覆依據artUrl innerJoin起來
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- posts_Reviews %>%
select(cmtPoster, artPoster, artUrl)
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 788af05 DN-- 68 5994 --
## + attr: name (v/c), artUrl (e/c)
## + edges from 788af05 (vertex names):
## [1] jma306 ->longyin jma306 ->longyin
## [3] jma306 ->longyin jma306 ->longyin
## [5] jma306 ->longyin jma306 ->longyin
## [7] jma306 ->longyin mudee ->liang691206
## [9] mudee ->Yirgacheffe gwenwoo ->Yirgacheffe
## [11] Moratti ->Yirgacheffe Moratti ->Yirgacheffe
## [13] Moratti ->Yirgacheffe todao ->todao
## [15] todao ->todao birdy590 ->todao
## + ... omitted several edges
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
set.seed(123)
# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
reviewNetwork <-simplify(reviewNetwork, remove.multiple = T, remove.loops = F,
edge.attr.comb=c(weight="sum", type="ignore") )
target <- c("wolver","onetwogo","QQMMWA")
# 顯示有超過200個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=10,
vertex.label=ifelse(V(reviewNetwork)$label %in% target, V(reviewNetwork)$label, NA), vertex.label.font=1)
onewogo <- posts_Reviews %>%
filter((artPoster == "onetwogo") )%>%
select(artUrl,sentence,cmtPoster,artTitle)
#對文章斷詞
jieba_tokenizer <- worker(user = "HongKong_lexicon.txt", stop_word = "stop_words.txt")
hongkong_tokenizer <- function(t){
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
onewogo_tokens <- onewogo%>% unnest_tokens(word, sentence, token = hongkong_tokenizer) %>%
filter(nchar(word)>1)
word_count <- onewogo_tokens %>%
group_by(word) %>%
summarise(count = n()) %>%
filter(count>200) %>%
arrange(desc(count))
wordcloud2(word_count)
可以發現onewogo討論的議題有涉及到新加坡