Analyzing text and social-network data from the PTT Gossiping board on the Taiwan power outage
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
[1] ""
= c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr","text2vec")
packages = as.character(installed.packages()[,1])
existing for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud2)
require(text2vec)
<- read_csv("0612-1_articleMetaData.csv") # 文章
posts <- read_csv("0612-1_articleReviews.csv") # 回覆
reviews <- read_csv("0612-1_artWordPOSFreq.csv")
rd head(posts)
head(reviews)
head(rd)
# Sentence segmentation: replace blank lines ("\n\n") with "。"
mask_meta <- posts %>%
  mutate(sentence = gsub("[\n]{2,}", "。", sentence))

# Split on full- or half-width exclamation marks, question marks, semicolons, and the full-width period
mask_sentences <- strsplit(mask_meta$sentence, "[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, and collect into a data frame
mask_sentences <- data.frame(
  artUrl = rep(mask_meta$artUrl, sapply(mask_sentences, length)),
  sentence = unlist(mask_sentences)
) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$"))) # drop sentences that contain only whitespace

mask_sentences$sentence <- as.character(mask_sentences$sentence)
mask_sentences
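A quick check of the splitting rule on a made-up post body (the string below is a hypothetical example, not from the dataset):

demo <- "今天全台停電\n\n大家那邊還好嗎?"
demo <- gsub("[\n]{2,}", "。", demo) # the blank line becomes a full-width period
strsplit(demo, "[。!;?!?;]")[[1]]    # splits into two sentences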
# Initialize the tokenizer with a custom user dictionary and stop-word list
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")

# Define the tokenizing function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
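A minimal sanity check for the tokenizer, assuming user_dict.txt and stop_words.txt exist in the working directory (the input string is hypothetical):

# Returns a list with one character vector of tokens, single-character tokens removed
chi_tokenizer("台電宣布分區輪流停電")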
rd_tokens_all <- posts %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  select(-artTime)

# Tokenize each sentence with the tokenizer initialized above
tokens <- mask_sentences %>%
  mutate(sentence = gsub("[[:punct:]]", "", sentence)) %>%
  mutate(sentence = gsub("[0-9a-zA-Z]", "", sentence)) %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  count(artUrl, word) %>% # word frequency within each article
  rename(count = n)
tokens
# save.image(file = "../data/token_result.rdata")
freq = 3
# Keep words that occur more than `freq` times overall
reserved_word <- tokens %>%
  group_by(word) %>%
  count() %>%
  filter(n > freq) %>%
  pull(word)
mask_removed <- tokens %>%
  filter(word %in% reserved_word)
# mask_dtm: nrow = number of articles; ncol = number of terms
mask_dtm <- mask_removed %>% cast_dtm(artUrl, word, count)
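cast_dtm returns a tm DocumentTermMatrix backed by a sparse triplet representation; a quick look at its size and sparsity (a sketch, values depend on the data):

dim(mask_dtm) # rows = articles, columns = terms
# $v holds the non-zero entries, so this is the share of zero cells
1 - length(mask_dtm$v) / (nrow(mask_dtm) * ncol(mask_dtm))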
data <- rd %>%
  dplyr::select(artDate, artUrl) %>%
  distinct()
article_count_by_date <- data %>%
  group_by(artDate) %>%
  summarise(count = n())
head(article_count_by_date, 20)
posts %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point()
data <- rd %>%
  group_by(word) %>%
  summarise(sum = sum(count), .groups = 'drop') %>%
  arrange(desc(sum))
data %>% filter(sum > 50) %>% wordcloud2()
# Positive-word dictionary (txt file, comma-separated)
P <- read_file("positive.txt")

# Negative-word dictionary (txt file)
N <- read_file("negative.txt")

# Split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build a data frame with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
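To see what the LIWC joins below do, here is a toy example with made-up tokens, assuming 穩定 is in the positive list and 跳電 in the negative one:

toy <- data.frame(word = c("穩定", "跳電", "電網"))
# inner_join keeps only dictionary words: 穩定 -> positive, 跳電 -> negative; 電網 is dropped
toy %>% inner_join(LIWC, by = "word")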
rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
rd_tokens_by_date <- rd_tokens %>%
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)
rd_tokens_by_date
rd_tokens_by_date %>%
  inner_join(LIWC) %>%
  select(word) %>%
  inner_join(LIWC)
Joining, by = "word"
Joining, by = "word"
sentiment_count = rd_tokens_by_date %>%
  select(artDate, word, n) %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = sum(n))
Joining, by = "word"
`summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  labs(x = NULL, y = "count")
rd_tokens_all %>%
  filter(artDate == as.Date("2021-05-13") |
         artDate == as.Date("2021-05-14") |
         artDate == as.Date("2021-05-15") |
         artDate == as.Date("2021-05-16") |
         artDate == as.Date("2021-05-17") |
         artDate == as.Date("2021-05-18")) %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(count = n()) %>%
  data.frame() %>%
  top_n(30, wt = count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x = "word", y = "count") +
  facet_wrap(~sentiment, scales = "free_y") +
  theme(text = element_text(size = 14)) +
  coord_flip()
Joining, by = "word"
`summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
ldas = c()
topics = c(2, 4, 6, 10, 15)
for (topic in topics) {
  start_time <- Sys.time()
  lda <- LDA(mask_dtm, k = topic, control = list(seed = 2021))
  ldas = c(ldas, lda)
  print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
  save(ldas, file = "ldas_result.rdata") # write the models to disk
}
[1] "2 topic(s) and use time is 2.04346084594727"
[1] "4 topic(s) and use time is 7.30365300178528"
[1] "6 topic(s) and use time is 16.5211598873138"
[1] "10 topic(s) and use time is 27.2711720466614"
[1] "15 topic(s) and use time is 50.7416360378265"
Use perplexity to find the optimal number of topics.
library(purrr)
Attaching package: 'purrr'
The following objects are masked from 'package:igraph':
    compose, simplify
topics = c(2, 4, 6, 10, 15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
library(udpipe)
dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
set.seed(2021)
topic_n = 6

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
lda_model$get_top_words(n = 10, lambda = 0.5)
lda_model$plot()
lda_model$plot(out.dir = "lda_result", open.browser = TRUE)
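The lambda argument of get_top_words trades raw within-topic frequency against topic-exclusivity (the LDAvis relevance score); comparing two settings side by side can make labeling easier (a sketch):

lda_model$get_top_words(n = 10, lambda = 1)   # lambda = 1: rank purely by within-topic probability
lda_model$get_top_words(n = 10, lambda = 0.2) # smaller lambda favors terms exclusive to a topic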
Feed the prepared DTM into the LDA function.
# Split into 6 topics
mask_lda <- LDA(mask_dtm, k = 6, control = list(seed = 123))
Extract the representative terms (term).
#removed_word = c("不是","每天","出來","覺得")
removed_word = c("一下","不是","停電","有沒有","不會","發電")
# Inspect the common terms of each topic
tidy(mask_lda, matrix = "beta") %>% # extract the topic-term beta values
  filter(!term %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, beta) %>% # top-10 terms by beta
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
The topics can be summarized as:
+ topic 1 = "energy policy"
+ topic 2 = "complaints about the government"
+ topic 3 = "political figures"
+ topic 4 = "how to keep the power supply stable"
+ topic 5 = "mocking guesses at the next outage"
+ topic 6 = "assigning blame for the outage"

Below we pick topics 1, 2, and 6 for comparison.
Extract the representative topic (topic) of each document.
# Use matrix = "gamma" in the tidy function to get the theta matrix
mask_topics <- tidy(mask_lda, matrix = "gamma") %>% # document-topic gamma values
  group_by(document) %>%
  top_n(1, wt = gamma)
mask_topics
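Note that top_n keeps ties, so a document whose largest gamma is shared by two topics appears twice and would be duplicated by the merge below; if strictly one topic per document is wanted, slice_max with with_ties = FALSE is a possible alternative (a sketch; mask_topics_unique is a hypothetical name):

mask_topics_unique <- tidy(mask_lda, matrix = "gamma") %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>% # break ties, keep exactly one row per document
  ungroup()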
posts_topic <- merge(x = posts, y = mask_topics, by.x = "artUrl", by.y = "document")
# Inspect what each topic is about
set.seed(12345)
posts_topic %>% # topic 1
  filter(topic == 1) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>% # topic 2
  filter(topic == 2) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>% # topic 3
  filter(topic == 3) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>% # topic 4
  filter(topic == 4) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>% # topic 5
  filter(topic == 5) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>% # topic 6
  filter(topic == 6) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(12)
posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate, topic) %>%
  summarise(count = n()) %>% # number of articles per topic per day
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position = "fill")
posts_topic %>%
  group_by(artCat, topic) %>%
  summarise(sum = n()) %>%
  ggplot(aes(x = artCat, y = sum, fill = as.factor(topic))) +
  geom_col(position = "dodge")
Merge the data.
# Articles and comments
reviews <- reviews %>%
  select(artUrl, cmtPoster, cmtStatus, cmtContent)
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")

# Attach each article's topic
posts_Reviews <- merge(x = posts_Reviews, y = mask_topics, by.x = "artUrl", by.y = "document")
head(posts_Reviews, 3)
Keep three columns: cmtPoster (commenter), artPoster (author), and artUrl (article URL).
link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)
head(link, 3)
Build the network.
reviewNetwork <- graph_from_data_frame(d = link, directed = T)
reviewNetwork
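Before any filtering, a few standard igraph summaries give a feel for the network's size and shape (a sketch):

vcount(reviewNetwork) # number of users (nodes)
ecount(reviewNetwork) # number of reply relations (edges)
# in-degree = replies received; the accounts replied to most often
head(sort(degree(reviewNetwork, mode = "in"), decreasing = TRUE), 5)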
# Check the distribution of comment counts (useful for the filtering below)
posts %>%
  filter(commentNum < 100) %>%
  ggplot(aes(x = commentNum)) + geom_histogram()
Filter posts and reviews by number of articles posted or comments made.
## articles per account
post_count = posts %>%
  group_by(artPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
post_count

## comments per account
review_count = reviews %>%
  group_by(cmtPoster) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
review_count

## posters: keep accounts with at least 2 articles
poster_select <- post_count %>% filter(count >= 2)
posts <- posts %>% filter(posts$artPoster %in% poster_select$artPoster)

## commenters: keep accounts with at least 5 comments
reviewer_select <- review_count %>% filter(count >= 5)
reviews <- reviews %>% filter(reviews$cmtPoster %in% reviewer_select$cmtPoster)
# Number of participants
length(unique(posts_Reviews$artPoster)) # posters: 911
length(unique(posts_Reviews$cmtPoster)) # commenters: 15224
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster)
length(unique(allPoster)) # total participants: 15641
Tag every user that appears.
userList <- data.frame(user = unique(allPoster)) %>%
  mutate(type = ifelse(user %in% posts$artPoster, "poster", "replyer"))
head(userList, 3)
link <- posts_Reviews %>%
  group_by(cmtPoster, artUrl) %>%
  filter(n() > 3) %>%
  filter(commentNum > 5) %>%
  filter(artCat == "Gossiping") %>%
  filter(artDate == as.Date('2021-05-13')) %>%
  select(cmtPoster, artPoster, artUrl) %>%
  unique()
link
Keep the users that appear in link.
filtered_user <- userList %>%
  filter(user %in% link$cmtPoster | user %in% link$artPoster) %>%
  arrange(desc(type))
head(filtered_user, 3)
The arrows clutter the plot, so here we drop edge direction to remove unnecessary detail. We call set.seed because igraph lays the graph out randomly.
set.seed(487)
# An earlier variant filtered vertices by degree, e.g. degree(reviewNetwork) > 2
reviewNetwork <- graph_from_data_frame(d = link, v = filtered_user, directed = F)
plot(reviewNetwork, vertex.size = 3, edge.arrow.size = 0.3, vertex.label = NA)
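set.seed is needed because plot.igraph draws a fresh random layout on every call; an alternative is to compute one layout and reuse it, which keeps node positions identical across the plots that follow (a sketch):

fixed_layout <- layout_with_fr(reviewNetwork) # Fruchterman-Reingold layout, computed once
plot(reviewNetwork, layout = fixed_layout, vertex.size = 3, vertex.label = NA)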
Add node information: color the nodes by user role.
set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size = 3, edge.arrow.size = 0.3, vertex.label = NA)
The plot shows some structure among the nodes (users), but a bare graph alone is hard to interpret, so next we layer information from the dataset onto it.
Label the nodes with account names, and use degree to choose which labels to show, so the plot is not buried under text.
filter_degree = 5
set.seed(123)
# Set the node labels / colors
labels <- degree(reviewNetwork) # degree of every node
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
plot(reviewNetwork,
     vertex.size = 3,
     edge.width = 3,
     vertex.label.dist = 1,
     vertex.label = ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),
     vertex.label.font = 2)
We can now see the basic user relations, but we would like to visualize richer information, for example which kinds of articles a user tends to join, or whether a user is popular within the network.
Build link: take the articles of 2021-05-13; keep commenters who replied to the same article more than 3 times, on articles with more than 5 comments, whose topic is 1, 2, or 6; keep only the columns cmtPoster (commenter), artPoster (author), artUrl (article URL), and topic.
link <- posts_Reviews %>%
  group_by(cmtPoster, artUrl) %>%
  filter(n() > 3) %>%
  filter(commentNum > 5) %>%
  filter(artCat == "Gossiping") %>% # HatePolitics / Gossiping
  filter(artDate == as.Date('2021-05-13')) %>%
  filter(topic == 1 | topic == 2 | topic == 6) %>%
  select(cmtPoster, artPoster, artUrl, topic) %>%
  unique()
link
Build nodes: from all users, keep those that appear in link.
filtered_user <- userList %>%
  filter(user %in% link$cmtPoster | user %in% link$artPoster) %>%
  arrange(desc(type))
head(filtered_user, 3)
filter_degree = 5

# Build the network
reviewNetwork <- graph_from_data_frame(d = link, v = filtered_user, directed = F)
reviewNetwork
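graph_from_data_frame stores every column of d after the first two as an edge attribute, which is what makes the topic-based edge coloring below possible; this can be checked with:

edge_attr_names(reviewNetwork) # should list "artUrl" and "topic"
head(E(reviewNetwork)$topic)   # topic carried by each commenter-author edge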
# Color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
# Color each edge by the topic of the article the reply belongs to
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "2", "palevioletred", "lightgreen")
# Plot the network
set.seed(5432)
plot(reviewNetwork, vertex.size = 3, edge.width = 3, vertex.label.dist = 1,
     vertex.label = ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),
     vertex.label.font = 2)
# Add legends
legend("bottomright", c("poster", "replier"), pch = 21,
       col = "#777777", pt.bg = c("gold", "lightblue"), pt.cex = 1, cex = 1)
legend("topleft", c("criticism/mockery", "news-related"),
       col = c("palevioletred", "lightgreen"), lty = 1, cex = 1)
filter_degree = 6 # user degree threshold

# Filter comments by the commenter's push/boo status toward the author
link <- posts_Reviews %>%
  filter(artCat == "Gossiping") %>%
  filter(commentNum > 10) %>%
  filter(cmtStatus != "→") %>% # drop neutral "arrow" comments
  group_by(cmtPoster, artUrl) %>%
  filter(n() > 2) %>%
  ungroup() %>%
  select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
  unique()
# Now draw the network; the steps are the same as before, so we do not repeat the details
# Keep the users that appear in link
filtered_user <- userList %>%
  filter(user %in% link$cmtPoster | user %in% link$artPoster) %>%
  arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d = link, v = filtered_user, directed = F)
reviewNetwork

# Color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type == "poster", "gold", "lightblue")
# Color each edge by the comment's push/boo status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Plot the network
set.seed(5432)
plot(reviewNetwork, vertex.size = 2, edge.width = 3, vertex.label.dist = 1,
     vertex.label = ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),
     vertex.label.font = 2)
# Add legends
legend("bottomright", c("poster", "replier"), pch = 21,
       col = "#777777", pt.bg = c("gold", "lightblue"), pt.cex = 1, cex = 1)
legend("topleft", c("push (推)", "boo (噓)"),
       col = c("lightgreen", "palevioletred"), lty = 1, cex = 1)
What were the main points of discussion around the Taiwan power outage, and into which camps did the wind divide? For the articles collected between 2021-05-21 and 2021-05-23, they roughly split into mocking the 校正回歸 (retroactive case-count correction) and discussing it objectively, plus two further groups focused on confirmed cases' movement histories and on vaccines, four in total. Discussion centered on the statistics: the case numbers, publication dates, and how the counts were calculated.
Which way does the wind currently blow? There are quite a few articles objectively discussing the calculation method, but mocking, gossip-style articles dominate.
How is the network discussing 校正回歸 distributed? By article count the critical and mocking users are the majority, but the network view shows that both camps generate high discussion volume around their posts.
Who are the opinion leaders on 校正回歸, and how do commenters react? Because the collection window is short, a few posts with high reply counts are enough to become hubs of the network. On the Gossiping board, centre0130 leads the news-oriented discussion, drawing both pushes and boos, while hstf leads the mocking/critical side, drawing mostly positive pushes.