packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr","reshape2")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
# 文章資料
posts <- read_csv("滴妹飲料店_new_articleMetaData.csv")
posts
# 回覆資料
reviews <- read_csv("滴妹飲料店_new_articleReviews.csv")
reviews
# 選取需要的欄位
reviews <- reviews %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
reviews
posts %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="blue", size=1)+
geom_vline(xintercept = as.Date('2020-05-10'),color = "red")+
theme_classic()
滴妹宣布開飲料店時間為’2020-04-04’飲料開店日期為2020-05-10,可以發現在開幕期間有較高的討論度。
length(unique(posts$artPoster))
## [1] 108
length(unique(reviews$cmtPoster))
## [1] 3440
allPoster <- c(posts$artPoster, reviews$cmtPoster)
length(unique(allPoster))
## [1] 3506
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
userList
# 把原文與回覆依據artUrl innerJoin起來
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
posts_Reviews
# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- posts_Reviews %>%
select(cmtPoster, artPoster, artUrl)
link
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 4fca9c5 DN-- 3506 6484 --
## + attr: name (v/c), artUrl (e/c)
## + edges from 4fca9c5 (vertex names):
## [1] tomshiou ->wuyilinn bushcorpese ->wuyilinn mij ->wuyilinn
## [4] jim12441 ->wuyilinn Beanoodle ->wuyilinn cs09312 ->wuyilinn
## [7] netsphere ->wuyilinn crossdunk ->wuyilinn crossdunk ->wuyilinn
## [10] leo0873 ->wuyilinn crossdunk ->wuyilinn double2783 ->wuyilinn
## [13] nakayamayyt ->wuyilinn ian15987 ->wuyilinn fly0204 ->wuyilinn
## [16] Informatik ->wuyilinn AVR0 ->wuyilinn safah ->wuyilinn
## [19] mario2000 ->wuyilinn cena0605 ->VVizZ Chia2323 ->VVizZ
## [22] ghostl40809 ->VVizZ tetsuya0310 ->VVizZ Lailungsheng->VVizZ
## + ... omitted several edges
# 畫出網路圖
plot(reviewNetwork)
可以發現密密麻麻的東西,完全無法從圖中獲得資訊。
# 把點點的大小和線的粗細調小,並不顯示使用者帳號。
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)
還是無法看出資訊,我們縮小文章數量試試看。
link <- posts_Reviews %>%
filter(artDate == as.Date('2020-05-10')) %>%
select(cmtPoster, artPoster, artUrl) %>%
unique()
link
# 這邊要篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
filtered_user
set.seed(500)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)
可以稍微看出圖中的點(人)之間有一定的關聯,不過目前只有單純圖形我們無法分析其中的內容。
因此將資料集中的其他資訊加到我們的圖片中。
set.seed(487)
# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)
set.seed(487)
# 篩選要顯示出的使用者,以免圖形被密密麻麻的文字覆蓋
# 顯示有超過5個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.2,
vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA), vertex.label.font=2)
我們可以看到基本的使用者關係,但是我們希望能夠將更進階的資訊視覺化。
例如:使用者經常參與的文章種類,或是使用者在該社群網路中是否受到歡迎。
# 文章斷句
drink_meta <- posts %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
drink_sentences <- strsplit(drink_meta$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
drink_sentences <- data.frame(
artUrl = rep(drink_meta$artUrl, sapply(drink_sentences, length)),
sentence = unlist(drink_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
drink_sentences$sentence <- as.character(drink_sentences$sentence)
drink_sentences
## 文章斷詞
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
# tokenize function
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
tokens <- drink_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens
## 清理斷詞結果
# 挑出總出現次數大於3的字
reserved_word <- tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
drink_removed <- tokens %>%
filter(word %in% reserved_word)
drink_dtm <- drink_removed %>% cast_dtm(artUrl, word, count)
drink_dtm
## <<DocumentTermMatrix (documents: 143, terms: 283)>>
## Non-/sparse entries: 1955/38514
## Sparsity : 95%
## Maximal term length: 4
## Weighting : term frequency (tf)
# LDA分成3個主題
drink_lda <- LDA(drink_dtm, k = 3, control = list(seed = 1234))
# 看各群的常用詞彙
tidy(drink_lda, matrix = "beta") %>%
filter(! term %in% c("滴妹","再睡五分鐘","飲料店")) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(text = element_text(family='STHeitiTC-Light'))
可以歸納出
topic 1 = “飲料店加盟”
topic 2 = “相關性比較(滴妹和其他女youtuber或是飲料店和其他手搖飲相比)”
topic 3 = “滴妹飲料店開幕”
以下我們挑出第一個主題與第二個主題來做比較。
# 使用LDA分類每篇文章的主題
drink_topics <- tidy(drink_lda, matrix="gamma") %>%
group_by(document) %>%
top_n(1, wt=gamma)
drink_topics
# 把文章資訊和主題join起來
posts_Reviews <- merge(x = posts_Reviews, y = drink_topics, by.x = "artUrl", by.y="document")
posts_Reviews
# 挑選出2020/05/09後的文章,
# 篩選有在3篇以上文章回覆者,
# 文章主題歸類為1(飲料店加盟)與2(相關性比較)者,
# 欄位只取:cmtPoster(評論者), artPoster(發文者), artUrl(文章連結), topic(主題)
link <- posts_Reviews %>%
filter(artDate > as.Date('2020-05-09')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>3) %>%
ungroup() %>%
filter(topic == 1 | topic == 2) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
link
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
filtered_user
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 2, V(reviewNetwork)$label, NA),vertex.label.font=2)
# 加入標示
par(family='STHeitiTC-Light')
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("飲料店加盟","相關性比較"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
可以看出帳號“VVizZ”和“T3T”和“a205090a”和“disgusting”所發的文多在討論『飲料店加盟』的主題,
帳號“Hertzfeld”和“gn01765288”和“kuninaka”等所發的文多在討論『相關性比較』的主題。
#green
posts %>%
filter(artDate > as.Date('2020-05-09')) %>%
filter( artPoster %in% c("VVizZ","T3T","a205090a","disgusting"))
#red
filtered_user %>%
filter(user %in% c("Hertzfeld","gn01765288","kuninaka")) %>%
arrange(desc(type))
posts %>%
filter(artDate > as.Date('2020-05-09')) %>%
filter( artPoster %in% c("Hertzfeld","gn01765288","kuninaka"))
可以看出帳號“VVizZ”,“T3T”所發的文多在延伸討論關於飲料店背後的利益關係,
而帳號“Hertzfeld”,“gn01765288”,“kuninaka”所發的文多在討論和其他藝人開店做相比。
# PTT的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- posts_Reviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>1) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=1, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 3, V(reviewNetwork)$label, NA),vertex.label.font=2)
# 加入標示
par(family='STHeitiTC-Light')
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
帳號“d841129”,“Hertzfeld”,“VVizZ”與推文較相關。帳號“T3T”,“love0504”與噓文較相關。
#red
posts %>%
filter(artDate > as.Date('2020-05-09')) %>%
filter( artPoster %in% c("gn01765288","T3T","juyhnmki","d841129"))
reviews %>%
filter( cmtPoster %in% c("love0504"))
文章大多推的鄉民較多,噓文以比較相關的文章有激烈的討論與較多的不認同,其中以“love0504”為噓文橋梁(回覆者)。
結語:
使用network的視覺化可以讓我們對於網路中的使用者及他們在這個網路中可能扮演的角色有更多地瞭解。