發文次數>30次的使用者 回覆次數>600次的使用者 的網路關係圖

安裝packages

packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

載入packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(topicmodels)
library(stringr)
library(ggplot2)
require(wordcloud2)
## Loading required package: wordcloud2
library(wordcloud)
## Loading required package: RColorBrewer

讀取文章資料

posts <- read_csv("./HongKong_articleMetaData.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )

計算每個帳號發文的次數

poster_count <- posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

找出發文次數>30次的使用者

poster_more30 <- poster_count %>%
  filter(count >= 30)
posts <- posts %>%
  filter(posts$artPoster %in% poster_more30 $artPoster)

讀取回覆資料

reviews <- read_csv("./HongKong_articleReviews.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   cmtPoster = col_character(),
##   cmtStatus = col_character(),
##   cmtDate = col_datetime(format = ""),
##   cmtContent = col_character()
## )

計算每個帳號回覆的次數

reviewer_count <- reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

找出回覆次數>600次的使用者

reviewer_more600 <- reviewer_count %>%
  filter(count >= 600)
reviews <- reviews %>%
  filter(reviews$cmtPoster %in% reviewer_more600$cmtPoster)

選取需要的欄位

reviews <- reviews %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)

整理所有參與人

allPoster <- c(posts$artPoster, reviews$cmtPoster)
length(unique(allPoster))
## [1] 69
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))

將原文與回覆Join起來

# 把原文與回覆依據artUrl innerJoin起來
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")

篩選欄位

# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- posts_Reviews %>%
      select(cmtPoster, artPoster, artUrl)

建立網路關係

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 788af05 DN-- 68 5994 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 788af05 (vertex names):
##  [1] jma306      ->longyin     jma306      ->longyin    
##  [3] jma306      ->longyin     jma306      ->longyin    
##  [5] jma306      ->longyin     jma306      ->longyin    
##  [7] jma306      ->longyin     mudee       ->liang691206
##  [9] mudee       ->Yirgacheffe gwenwoo     ->Yirgacheffe
## [11] Moratti     ->Yirgacheffe Moratti     ->Yirgacheffe
## [13] Moratti     ->Yirgacheffe todao       ->todao      
## [15] todao       ->todao       birdy590    ->todao      
## + ... omitted several edges
# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
set.seed(123)

# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
reviewNetwork <-simplify(reviewNetwork, remove.multiple = T, remove.loops = F, 
                 edge.attr.comb=c(weight="sum", type="ignore") )
target <- c("wolver","onetwogo","QQMMWA")

# 顯示有超過200個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=10,
     vertex.label=ifelse(V(reviewNetwork)$label %in% target, V(reviewNetwork)$label, NA),  vertex.label.font=1)

分析onewogo

onewogo <- posts_Reviews %>% 
  filter((artPoster == "onetwogo") )%>% 
  select(artUrl,sentence,cmtPoster,artTitle)

#對文章斷詞

jieba_tokenizer <- worker(user = "HongKong_lexicon.txt", stop_word = "stop_words.txt")

hongkong_tokenizer <- function(t){
  lapply(t, function(x){
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
onewogo_tokens <- onewogo%>% unnest_tokens(word, sentence, token = hongkong_tokenizer) %>% 
  filter(nchar(word)>1)

每個詞的數量

word_count <- onewogo_tokens %>% 
  group_by(word) %>% 
  summarise(count = n()) %>% 
  filter(count>200) %>% 
  arrange(desc(count))

文字雲

wordcloud2(word_count)

可以發現onewogo討論的議題有涉及到新加坡