PTT Gossiping board: netizens, already angry about the ongoing outbreak, grew dissatisfied with a string of government-linked attempts to steer online opinion.
The observation window is 2021-05-01 through 2021-05-29. During this period Taiwan's domestic COVID-19 outbreak kept escalating. The government claimed that PTT was one of the sources of Chinese disinformation and described it as part of China's "cognitive warfare" (認知作戰) against Taiwan. Netizens, however, soon doxxed a key disinformation poster and found that he was himself a DPP supporter who had been posting anti-ruling-party content on PTT, with clear signs of a false-flag operation. We therefore use text mining and sentiment analysis to explore how the wider online community viewed this incident.
Questions we explore:
1. How intense was the discussion in May, and which topics peaked?
2. What do the positive and negative articles contain?
3. According to LDA, what were netizens mainly discussing this month?
4. Using the LDA results, who are the opinion leaders for each class of article?
System settings
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""
Install the required packages
# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries
rm(list=ls(all=T))
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)
library(janeaustenr)
library(ngram)
library(widyr)
library(NLP)
library(ggraph)
library(igraph)
library(tm)
library(topicmodels)
library(purrr)
library(RColorBrewer)
library(LDAvis)
library(slam)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

# Read in the articles and comments
MetaData = fread('csv/army_a.csv',encoding = 'UTF-8')
Reviews = fread('csv/army_r.csv',encoding = 'UTF-8')
# Filter the articles once more: from 1067 down to 982
keywords = c('網軍','林瑋豐','自導自演','一條龍','認知作戰','認吱作戰')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])
# Keep the comments belonging to those articles: from 96537 down to 89176
reviews <- Reviews %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
Reviews2 = left_join(MetaData, Reviews[,c("artUrl", "cmtContent","cmtPoster","cmtStatus")], by = "artUrl")
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

(1). Tokenizing the articles
Set up the tokenizer
# Load a user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
# Define the tokenizer function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
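A quick sanity check of the tokenizer; a minimal illustration only, since the exact tokens depend on the contents of user_dict.txt and stop_words.txt:

customized_tokenizer("政府稱PTT是中國認知作戰的發源地")
# expected: a list with one character vector of tokens, e.g. "政府" "PTT" "認知作戰" ...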
# Combine the tokenization results of articles and comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
# Bind the two together
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])

(2). Basic data cleaning
# Format the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
# Remove special characters
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # drop punctuation
filter(!grepl("[0-9a-z]",word)) %>% # drop English letters and digits
filter(nchar(.$word)>1)
# Count daily frequencies of each word
# word_count: artDate, word, count
word_count <- data_select %>%
select(artDate,word) %>%
group_by(artDate,word) %>%
summarise(count=n()) %>% # per-day word totals via summarise
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
P <- read_file("dict/liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("dict/liwc/negative.txt") # negative-word dictionary (txt)
# Each dictionary txt file is read in as a single string
typeof(P)
## [1] "character"
Build the LIWC sentiment dictionary
# Split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 661 words
N = data.frame(word = N, sentiment = "negative") # 1049 words
# Combine the two dictionaries
LIWC = rbind(P, N)
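A quick shape check on the merged dictionary; the 661 and 1049 counts come from the comments above:

table(LIWC$sentiment)
# should report 661 positive and 1049 negative entries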
# Daily post counts
date = data %>% select(artDate, artUrl) %>% distinct()
date = date %>% group_by(artDate) %>% summarize(count_day = n()) %>% ungroup()
date = date %>% arrange(desc(count_day))

date_plot <- date %>%
ggplot(aes(x = artDate, y = count_day)) +
geom_line(color = "red", size = 1.5) +
geom_point() +
scale_x_date(labels = date_format("%Y/%m/%d"), breaks = "1 week") +
ggtitle("討論文章數") +
xlab("日期") +
ylab("數量")
date_plot

g_tokens_by_date <- data_select %>% count(artDate, word, sort = TRUE)
plot_merge <- g_tokens_by_date %>%
filter( artDate == as.Date("2021-05-20") |
artDate == as.Date("2021-05-24")) %>%
group_by(artDate) %>%
top_n(7, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n, fill = artDate)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL) +
facet_wrap(~artDate, scales="free", ncol = 2) +
coord_flip()
plot_merge

sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
range(sentiment_count$artDate)
## [1] "2021-05-01" "2021-05-28"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%Y/%m/%d"),
limits = as.Date(c('2021-05-01','2021-05-29'))
)+
# Add a dashed line marking 5/20
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-05-20'))[1]])), colour = "black", linetype=4) +
# Add a dashed line marking 5/24
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-05-24'))[1]])), colour = "black", linetype=4)

# 2021-05-11 has no negative row; add a zero count so the ratio line below is continuous
b = data.frame(artDate="2021-05-11",sentiment="negative",count=0)
b$artDate = as.Date(b$artDate,format="%Y-%m-%d")
b$count = as.integer(b$count)
sentiment_count = bind_rows(sentiment_count,b)
sentiment_count %>%
# Normalize to daily ratios
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# Plot
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-05-01','2021-05-29'))
)

sentiment_count %>% filter(artDate<=as.Date("2021-05-29",format="%Y-%m-%d")) %>%
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-05-20','2021-05-29'))
)+
geom_vline(aes(xintercept = as.integer(as.Date("2021-05-24",format="%Y-%m-%d"))), col = "black",linetype=4)
## Warning: Removed 32 row(s) containing missing values (geom_path).
# Draw the word cloud
word_count %>%
filter(artDate == as.Date('2021-05-24')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
filter(count>200) %>% # drop words that appear too rarely
wordcloud2()
## Adding missing grouping variables: `artDate`
# sentiment_sum:word,sentiment,sum
sentiment_sum <-
word_count %>%
filter(artDate == as.Date('2021-05-24')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
top_n(20,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()

sentiment_sum %>%
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"), # positive negative
max.words = 50)

article_type =
data_select %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
spread(sentiment,count,fill = 0) %>% # spread positive/negative into columns, filling missing values with 0
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# Count how many articles lean positive vs negative
article_type %>%
group_by(type) %>%
summarise(count = n())
## # A tibble: 2 x 2
## type count
## * <chr> <int>
## 1 negative 497
## 2 positive 196
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")
article_type_date$artDate = as.Date(article_type_date$artDate,format="%Y/%m/%d")
article_type_date %>%
group_by(artDate,type) %>%
summarise(count = n()) %>%
ggplot(aes(x = artDate, y = count, fill = type)) +
geom_bar(stat = "identity", position = "dodge")+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-05-01','2021-05-29'))
)
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 1 rows containing missing values (geom_bar).
# Format the date column
MToken$artDate= MToken$artDate %>% as.Date("%Y/%m/%d")
# Remove special characters
data_select_a = MToken %>%
filter(!grepl('[[:punct:]]',word)) %>% # drop punctuation
filter(!grepl("[0-9a-z]",word)) %>% # drop English letters and digits
filter(nchar(.$word)>1)
# Count word frequencies per article
# tokens: artUrl, word, count
tokens <- data_select_a %>%
select(artUrl,word) %>%
group_by(artUrl,word) %>%
summarise(count=n()) %>% # per-article word totals via summarise
# (rare-word filter omitted here)
arrange(desc(count))
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 716, terms: 11505)>>
## Non-/sparse entries: 38281/8199299
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
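The DTM is almost entirely sparse. If LDA fitting were too slow or memory-hungry, the rarest terms could optionally be trimmed first; a minimal sketch using tm::removeSparseTerms, where the 0.99 threshold is an assumption and not part of this pipeline:

# Optional: drop terms absent from more than 99% of documents (threshold illustrative)
# dtm_trimmed <- removeSparseTerms(dtm, 0.99)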
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 38/62
## Sparsity : 62%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 林瑋豐 肥肥 阿伯
## https://www.ptt.cc/bbs/Gossiping/M.1621540161.A.405.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621572487.A.59C.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621687845.A.8D8.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621768043.A.BEC.html 0 24 20
## https://www.ptt.cc/bbs/Gossiping/M.1621857669.A.025.html 3 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621879761.A.AFA.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621895979.A.0DC.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621919105.A.780.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621930555.A.3E2.html 18 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621956282.A.6B8.html 27 0 0
## Terms
## Docs 政府 柯文哲 帳號
## https://www.ptt.cc/bbs/Gossiping/M.1621540161.A.405.html 8 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1621572487.A.59C.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621687845.A.8D8.html 2 44 0
## https://www.ptt.cc/bbs/Gossiping/M.1621768043.A.BEC.html 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621857669.A.025.html 9 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621879761.A.AFA.html 1 0 24
## https://www.ptt.cc/bbs/Gossiping/M.1621895979.A.0DC.html 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621919105.A.780.html 32 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621930555.A.3E2.html 0 0 7
## https://www.ptt.cc/bbs/Gossiping/M.1621956282.A.6B8.html 0 0 4
## Terms
## Docs 資訊 監督 網軍
## https://www.ptt.cc/bbs/Gossiping/M.1621540161.A.405.html 1 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621572487.A.59C.html 23 0 2
## https://www.ptt.cc/bbs/Gossiping/M.1621687845.A.8D8.html 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621768043.A.BEC.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621857669.A.025.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621879761.A.AFA.html 2 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621895979.A.0DC.html 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621919105.A.780.html 2 22 26
## https://www.ptt.cc/bbs/Gossiping/M.1621930555.A.3E2.html 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621956282.A.6B8.html 1 0 0
## Terms
## Docs 認知作戰
## https://www.ptt.cc/bbs/Gossiping/M.1621540161.A.405.html 17
## https://www.ptt.cc/bbs/Gossiping/M.1621572487.A.59C.html 1
## https://www.ptt.cc/bbs/Gossiping/M.1621687845.A.8D8.html 1
## https://www.ptt.cc/bbs/Gossiping/M.1621768043.A.BEC.html 5
## https://www.ptt.cc/bbs/Gossiping/M.1621857669.A.025.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1621879761.A.AFA.html 1
## https://www.ptt.cc/bbs/Gossiping/M.1621895979.A.0DC.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1621919105.A.780.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1621930555.A.3E2.html 6
## https://www.ptt.cc/bbs/Gossiping/M.1621956282.A.6B8.html 6
ldas = c()
topics = c(2,3,4,6,10,15)
for(topic in topics){
start_time <- Sys.time()
lda <- LDA(dtm, k = topic, control = list(seed = 2021))
ldas =c(ldas,lda)
print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
save(ldas,file = "ldas_result.rdata") # save the models to a file
}
## [1] "2 topic(s) and use time is 2.29373812675476"
## [1] "3 topic(s) and use time is 4.29079914093018"
## [1] "4 topic(s) and use time is 5.1099898815155"
## [1] "6 topic(s) and use time is 8.52049994468689"
## [1] "10 topic(s) and use time is 14.0820209980011"
## [1] "15 topic(s) and use time is 27.0083231925964"
topics = c(2,3,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
### The function below works around NAs that appear when the number of topics is large
### See https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi){
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = as.vector(table(doc_term$i)),
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}

the_lda = ldas[[2]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
#serVis(json_res,open.browser = T)

the_lda = ldas[[2]] # use the k = 3 model
topics_words <- tidy(the_lda, matrix = "beta") # note: pass "beta" to tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)
## # A tibble: 10 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 2 林瑋豐 0.0257
## 2 3 認知作戰 0.0226
## 3 3 網軍 0.0168
## 4 1 網軍 0.0144
## 5 2 反串 0.0141
## 6 3 台灣 0.0136
## 7 2 民進黨 0.0131
## 8 2 認知作戰 0.0130
## 9 2 帳號 0.0127
## 10 2 疾管家 0.0124
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
xlab("topic")+
scale_x_reordered()

# Name the topics
topics_name = c("鄉民以蟑螂統稱網軍","鄉民起底林瑋豐的PTT帳號","鄉民認為認知作戰是網軍自導自演")
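A quick cross-check that these labels line up with the fitted topics; topicmodels' terms() lists each topic's highest-probability terms:

terms(the_lda, 5) # top 5 terms per topic, for eyeballing against topics_name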
# For every document we have a probability distribution over its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
n_distinct(MetaData$artDate)
## [1] 26
document_topics <- doc_pro[MetaData$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(MetaData,document_topics_df)

news_topic %>%
arrange(desc(`鄉民以蟑螂統稱網軍`)) %>% head(10)
## artTitle artDate artTime
## 1: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 04:01:33
## 2: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 07:09:55
## 3: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 07:23:41
## 4: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 07:27:51
## 5: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 08:04:52
## 6: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 08:17:13
## 7: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 08:46:39
## 8: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 09:00:27
## 9: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 09:54:12
## 10: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 12:07:49
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html l42857
## 2: https://www.ptt.cc/bbs/Gossiping/M.1619852997.A.8EF.html cary4A240022
## 3: https://www.ptt.cc/bbs/Gossiping/M.1619853823.A.AF5.html chal
## 4: https://www.ptt.cc/bbs/Gossiping/M.1619854081.A.FE5.html ig49999
## 5: https://www.ptt.cc/bbs/Gossiping/M.1619856295.A.C77.html kingfsg7326
## 6: https://www.ptt.cc/bbs/Gossiping/M.1619857036.A.76C.html CPer
## 7: https://www.ptt.cc/bbs/Gossiping/M.1619858801.A.3B6.html alpac
## 8: https://www.ptt.cc/bbs/Gossiping/M.1619859630.A.6EC.html chirex
## 9: https://www.ptt.cc/bbs/Gossiping/M.1619862854.A.C2E.html joe255118
## 10: https://www.ptt.cc/bbs/Gossiping/M.1619870872.A.AF9.html limoncool
## artCat commentNum push boo
## 1: Gossiping 11 7 1
## 2: Gossiping 597 337 61
## 3: Gossiping 36 12 4
## 4: Gossiping 162 112 6
## 5: Gossiping 7 2 0
## 6: Gossiping 41 18 2
## 7: Gossiping 3 0 0
## 8: Gossiping 20 14 0
## 9: Gossiping 32 2 6
## 10: Gossiping 11 6 0
## sentence
## 1: 呱吉講的沒錯啊,\n比一下八卦版、D卡及臉書噗浪等社群,\nPTT明顯就是有網軍再經營,\n等一下就會來集體噓文了。\n\n
## 2: PTT只剩下一種顏色的網軍\n\n然後一直影射是民眾黨\n\n結果被抓出來的都是綠營的\n\n想想當初的3000英靈殿\nhttps://i.imgur.com/hzhEupZ.jpg\n直到現在這次事件被抓出來的還是綠的\n\n結果他還是堅持說不改變自己的看法\n\n這種只有顏色沒有是非\n\n連活生生擺在眼前的證據的可能當看不見的人\n\n可憐哪\nhttps://i.imgur.com/Y304H9R.jpg\n-\nSent from JPTT on my iPhone\n\n>所以他們都是白色網軍(得證)
## 3: 這事情很簡單\n\n八掛版有時走在社會前\n\n現在卻與現實脫節\n\n柯支持者的族群\n\n年齡層應該落在低卡\n\n反而佔領了年紀偏高的PTT\n\n不合常情\n\n而且cf支持率也沒有提昇\n\n本來PTT只有選前煩\n\n承平時期應該要各種有趣\n\n柯這種打法根本外行浪費錢\n\n也搞爛了PTT\n\n莫非他真以為他支持率降低是被帶風向嗎\n\n台北市沒搞好\n\n連PTT也不放過\n
## 4: 幫大家複習一下\n\n這是2019/12/26\n\n呱吉在自己的頻道上傳的影片\n\n放話\nhttps://youtu.be/Lin3joRaNdg?t=232\n然後過了一年多了\n\n2021/4/29於臉書張貼這個留言\nhttps://i.imgur.com/Y6yuHw1.png\n幹我要爆料\n一年多前說好要抄網軍的底\n\n一年多後說要爆關說議員的料\n\n更不用說前陣子只剩OO黨在養網軍\n\n然後都是\n\n放話!\n\n放話!\n\n放話!\n\n說好的民主開箱卻什麼都只說一半\n\n還是又沒證據只能影射?\n\n名副其實的「民主封箱 苦無證據」\n\n至於真相是什麼\n\n恐怕只有呱吉本人知道了........\n選前喊民主開箱 選後民主封箱\n被酸很正常啊
## 5: 個人雖然覺得韓粉=智障\n\n但至少韓粉敢大聲挺自己想挺的韓導\n\n綠蟑螂反而跟孬種一樣,永遠在背後裝中立\n\n聽到別人說自己是小綠、綠營支持者,這些蟑螂會氣得要死\n\n但又聽到有人在講綠營的不是馬上氣噗噗反擊\n\n當韓粉雖然很丟臉\n\n但當假中立的綠蟑螂更下系下井\n某垃圾x藍粉專才好笑 太魯閣才過沒多久\n馬上發文檢討柯文哲是交通殺手,下面有人留太魯閣馬上被刪留言\n都不是阿 不過有些人會認為只能兩邊擇一站\n他們的腦袋組成可能只有0或1而已 ㄎㄎ
## 6: 身為一個高知識份子\n明明很多證據都擺在眼前 卻裝作看不見\n卻還是要硬說影射別人是網軍卻又苦無證據\n\n這除了很扯之外可能根本別有用心了吧\n我看又是個為了傳說中的抗中保台\n可以不顧一切所有不正義行為的人吧\n
## 7: 大家不用再猜了\nhttps://youtu.be/G2DF9d-H6XQ?t=181\n依照南宮博士在2019年的說法,應該就是KMT了\n\n\n這麼老人的政黨也會玩網軍\n\n這種說法,就好像是在說我阿嬤會打世紀帝國一樣\n
## 8: 呱吉當然不敢出來講話啦。\n\n按他的個性,\n\n今天如果養網軍的是國民黨、民眾黨、時力、甚至統促黨,\n\n他早就第一個跳出來開幹嗆聲,\n\n說要跟對方沒完沒了,\n\n在議會都敢直接點名開幹國民黨老議員的大砲新議員,\n\n在高雄大港開唱可以公開表示要幹爆韓國瑜跟高雄人,\n\n這麼有Rock精神的人,反正聲明過只作一屆市議員,\n\n什麼不敢講?什麼都不怕!!\n\n至今都不敢出來表態說是什麼顏色有網軍?\n\n其實答案早就呼之欲出了,\n\n只是他就是不敢明講,\n\n不為什麼,\n\n講了會失去很多粉。\n\n\n\n什麼顏色這麼厲害?\n\n最近這幾天終於知道了答案了,\n\n原來是【黑色】。\n\n\n\n\n
## 9: ptt都綠共1450\n\n然後小英跟光頭都被噓爆\n\n時神也不知下神壇下幾次了\n\n呱吉這種綠共側翼\n\n在充斥1450的八卦應該要推高高阿\n\n結果每篇都在喬他\n\n認同他的都被噓\n\n都沒人覺得怪怪的喔?\n\n3000英靈都不知道多久了\n\n還整天3000英靈\n\n影射某黨有網軍\n\n結果整個版都在喬他\n\n只會看起來呱吉是對的不是嗎= =\n\n如果呱吉是錯的\n\n應該整個版都在推這個垃圾綠共才對啊\n\n
## 10: 怎麼到現在還有人在檢討呱吉說的話阿\n\n當初呱吉出來選我就知道這個人絕對有顏色\n\n那時候會信他的鄉民我只能說當時智商一定很低\n\n只要偶爾抓個帳號上來講幾句屁話\n\n鄉民就推爆 真的是有夠白痴的\n\n那時候我就不相信這個網紅講的任何話了\n\n大家都嘛知道台灣的網紅 品信為人 有幾個能信的\n\n要跨足政治 基本上也一定是在某個黨吸夠奶水才敢出來\n\n至於呱吉是吸了哪個黨的奶水 我相信在座的各位應該都很清楚\n\n只能說現在還在檢討他的人 我不知道在想什麼 因為他一開始立場就很明顯了\n\n你現在再來檢討 不就只是在透露你也是當初那817萬低智商的人之一嗎\n\n只有蠢蛋發現自己被騙才會這麼氣\n\n真正的智者這時候早就笑而不語\n\n不過現在看清呱吉這個人也不遲拉\n\n反正這次事件我猜他連個屁都不敢放 ㄏ\n\n八卦版到底只有誰的網軍 我相信只有最常使用八卦版的人才最清楚\n\n每次都會有一堆陌生帳號發大內宣新聞來這裡\n\n不然就是醜化在野黨的新聞\n\n還有帳號負責帶風向洗風向 不論是非只論顏色\n\n只剩哪個顏色的網軍 我相信非常清楚
## 鄉民以蟑螂統稱網軍 鄉民起底林瑋豐的PTT帳號 鄉民認為認知作戰是網軍自導自演
## 1: 0.003213154 0.1855164232 0.8112704230
## 2: 0.996652303 0.0016738484 0.0016738485
## 3: 0.997258472 0.0013707639 0.0013707639
## 4: 0.997987339 0.0010063304 0.0010063304
## 5: 0.998290358 0.0008548210 0.0008548210
## 6: 0.002255731 0.0022557309 0.9954885383
## 7: 0.992514667 0.0037426665 0.0037426666
## 8: 0.293017911 0.0008390308 0.7061430585
## 9: 0.996985559 0.0015072205 0.0015072205
## 10: 0.998932958 0.0005335211 0.0005335211
news_topic %>%
select(-c("commentNum","push","boo")) %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Monthly totals
news_topic %>%
select(-c("commentNum","push","boo")) %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

g_topics <- tidy(the_lda, matrix="gamma") %>% # document-topic gamma
group_by(document) %>%
top_n(1, wt=gamma)
g_topics
## # A tibble: 716 x 3
## # Groups: document [716]
## document topic gamma
## <chr> <int> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1621687845.A.8D8.html 1 1.00
## 2 https://www.ptt.cc/bbs/Gossiping/M.1621879761.A.AFA.html 1 1.00
## 3 https://www.ptt.cc/bbs/Gossiping/M.1621873431.A.00C.html 1 0.999
## 4 https://www.ptt.cc/bbs/Gossiping/M.1621894862.A.D00.html 1 0.937
## 5 https://www.ptt.cc/bbs/Gossiping/M.1619885923.A.D43.html 1 0.999
## 6 https://www.ptt.cc/bbs/Gossiping/M.1621898446.A.733.html 1 1.00
## 7 https://www.ptt.cc/bbs/Gossiping/M.1621577071.A.9A6.html 1 0.863
## 8 https://www.ptt.cc/bbs/Gossiping/M.1621215720.A.4A6.html 1 0.999
## 9 https://www.ptt.cc/bbs/Gossiping/M.1621563056.A.B32.html 1 1.00
## 10 https://www.ptt.cc/bbs/Gossiping/M.1621848853.A.C92.html 1 0.999
## # ... with 706 more rows
# Attach each comment to its article's topic
Reviews2 <- merge(x = Reviews2, y = g_topics, by.x = "artUrl", by.y="document")
head(Reviews2,3)
## artUrl
## 1: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## 2: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## 3: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## artTitle artDate artTime artPoster
## 1: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 04:01:33 l42857
## 2: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 04:01:33 l42857
## 3: Re:[新聞]PTT剩1種顏色網軍?呱吉親爆內幕 3層「 2021/05/01 04:01:33 l42857
## artCat commentNum push boo
## 1: Gossiping 11 7 1
## 2: Gossiping 11 7 1
## 3: Gossiping 11 7 1
## sentence
## 1: 呱吉講的沒錯啊,\n比一下八卦版、D卡及臉書噗浪等社群,\nPTT明顯就是有網軍再經營,\n等一下就會來集體噓文了。\n\n
## 2: 呱吉講的沒錯啊,\n比一下八卦版、D卡及臉書噗浪等社群,\nPTT明顯就是有網軍再經營,\n等一下就會來集體噓文了。\n\n
## 3: 呱吉講的沒錯啊,\n比一下八卦版、D卡及臉書噗浪等社群,\nPTT明顯就是有網軍再經營,\n等一下就會來集體噓文了。\n\n
## cmtContent cmtPoster cmtStatus
## 1: :講的好勞工開始上班囉 allnike 噓
## 2: :苦無證據我知道 DustToDust 推
## 3: :雖然PTT白網軍超誇張但有很多事他們反對的行為是對的 ahaha777 推
## topic gamma
## 1: 3 0.8112704
## 2: 3 0.8112704
## 3: 3 0.8112704
posts_Reviews = Reviews2

link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)
head(link,3)
## cmtPoster artPoster
## 1: allnike l42857
## 2: DustToDust l42857
## 3: ahaha777 l42857
## artUrl
## 1: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## 2: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## 3: https://www.ptt.cc/bbs/Gossiping/M.1619841695.A.1E4.html
## Basic network graph: build the relationships
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
## Warning in graph_from_data_frame(d = link, directed = T): In `d' `NA' elements
## were replaced with string "NA"
reviewNetwork
## IGRAPH 3ce0a3e DN-- 15544 78356 --
## + attr: name (v/c), artUrl (e/c)
## + edges from 3ce0a3e (vertex names):
## [1] allnike ->l42857 DustToDust ->l42857
## [3] ahaha777 ->l42857 ahaha777 ->l42857
## [5] junnn ->l42857 Lailungsheng->l42857
## [7] abs7862604 ->l42857 tumv ->l42857
## [9] quisk ->l42857 chanceiam ->l42857
## [11] tomx ->l42857 tryagain24 ->cary4A240022
## [13] kc ->cary4A240022 SayNoToPanda->cary4A240022
## [15] loserloser ->cary4A240022 dk2ftrmrn ->cary4A240022
## + ... omitted several edges
## Filtering the data
# Look at the distribution of comment counts (to guide the later filters)
MetaData %>%
# filter(commentNum<100) %>%
ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
posts = MetaData
# Number of posts per account
post_count = posts %>%
group_by(artPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
post_count
## # A tibble: 553 x 2
## artPoster count
## <chr> <int>
## 1 CPer 6
## 2 HollisterCo 6
## 3 kons 6
## 4 aure0914 5
## 5 chirex 5
## 6 ae034 4
## 7 Behind4 4
## 8 Birthday5566 4
## 9 ClutchShot 4
## 10 ig49999 4
## # ... with 543 more rows
# Total comments per account
review_count = Reviews2 %>%
group_by(cmtPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
review_count
## # A tibble: 15,348 x 2
## cmtPoster count
## <chr> <int>
## 1 s9234032 338
## 2 trywish 195
## 3 leecliff 150
## 4 butten986 144
## 5 jma306 135
## 6 powderzhon 122
## 7 VOLK11 120
## 8 Annis812 115
## 9 TZUYIC 115
## 10 yehpi 115
## # ... with 15,338 more rows
# Posters
poster_select <- post_count %>% filter(count >= 2)
posts <- posts %>% filter(posts$artPoster %in% poster_select$artPoster)
# Commenters
reviewer_select <- review_count %>% filter(count >= 20)
Reviews3 <- Reviews2 %>% filter(Reviews2$cmtPoster %in% reviewer_select$cmtPoster)

# Check how many people participated
length(unique(posts_Reviews$artPoster)) # number of posters: 553
## [1] 553
length(unique(posts_Reviews$cmtPoster)) # number of commenters: 15348
## [1] 15348
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster) # total participants: 15544
length(unique(allPoster))
## [1] 15544
Label every user who appeared
poster: posted at least once (possibly also commented); replyer: only commented
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
head(userList,3)
## user type
## 1 l42857 poster
## 2 cary4A240022 poster
## 3 chal replyer
## Filtering the network by date
# 5/24 had the most posts
link <- posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>3) %>%
filter(commentNum > 200) %>%
filter(artCat=="Gossiping") %>%
filter(artDate == as.Date('2021-05-24')) %>%
select(cmtPoster, artPoster, artUrl) %>%
unique()
link
## # A tibble: 647 x 3
## # Groups: cmtPoster, artUrl [647]
## cmtPoster artPoster artUrl
## <chr> <chr> <chr>
## 1 TZUYIC Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 2 newmp4 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 3 neoa01 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 4 tupacshkur Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 5 jimmyso Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 6 phoenixhong Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 7 kroutony Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 8 mice2 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.A.930.~
## 9 s9234032 borondawon https://www.ptt.cc/bbs/Gossiping/M.1621833645.A.E50.~
## 10 amida959 borondawon https://www.ptt.cc/bbs/Gossiping/M.1621833645.A.E50.~
## # ... with 637 more rows
# Keep only the users that appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user,3)
## user type
## 1 q347 replyer
## 2 IDfor2010 replyer
## 3 startupownbs replyer
set.seed(487)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
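Before plotting, low-degree vertices could optionally be pruned to reduce clutter; a minimal sketch with igraph's induced_subgraph, where the degree threshold of 2 is an assumption:

# Optional: keep only vertices with more than 2 connections (threshold illustrative)
# reviewNetwork <- induced_subgraph(reviewNetwork, V(reviewNetwork)[degree(reviewNetwork) > 2])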
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3, vertex.label=NA)

filter_degree = 20
set.seed(123)
# Set node labels / colors
labels <- degree(reviewNetwork) # compute each node's degree
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(
reviewNetwork,
vertex.size=4,
edge.width=1,
vertex.label.dist=2,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

## Filtering the network by topic
link <- posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>3) %>%
filter(commentNum > 200) %>%
filter(artCat=="Gossiping") %>% #HatePolitics / Gossiping
filter(artDate == as.Date('2021-05-24')) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
link
## # A tibble: 647 x 4
## # Groups: cmtPoster, artUrl [647]
## cmtPoster artPoster artUrl topic
## <chr> <chr> <chr> <int>
## 1 TZUYIC Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 2 newmp4 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 3 neoa01 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 4 tupacshkur Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 5 jimmyso Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 6 phoenixhong Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 7 kroutony Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 8 mice2 Beatrice322 https://www.ptt.cc/bbs/Gossiping/M.1621822128.~ 2
## 9 s9234032 borondawon https://www.ptt.cc/bbs/Gossiping/M.1621833645.~ 3
## 10 amida959 borondawon https://www.ptt.cc/bbs/Gossiping/M.1621833645.~ 3
## # ... with 637 more rows
# Nodes: from all users, keep those that appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))

filter_degree = 30
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "red")
# Color edges by the topic of the article the comment belongs to
E(reviewNetwork)$color <-case_when(E(reviewNetwork)$topic == "1" ~"palevioletred",
E(reviewNetwork)$topic == "2" ~"lightgreen",
E(reviewNetwork)$topic == "3" ~"lightblue")
# Plot the network (label only nodes with degree > 30)
set.seed(5432)
plot(reviewNetwork, vertex.size=4, edge.width=1.5, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
legend("topleft", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","red"), pt.cex=1, cex=1)
legend("bottomright", c("蟑螂網軍","鄉民起底林瑋豐","認知作戰"),
col=c("palevioletred", "lightgreen","lightblue"), lty=1, cex=1)filter_degree = 10 # 使用者degree
# Filter by commenters' push/boo reactions toward posters
link <- posts_Reviews %>%
filter(artCat=="Gossiping") %>%
filter(commentNum > 100) %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter( n() > 2) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Next we draw the network again; the steps mirror the previous section, so they are not detailed
# Keep only the users that appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "red")
# Color edges by comment status: push (推) vs boo (噓)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Plot the network
set.seed(5432)
plot(reviewNetwork, vertex.size=3, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","red"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)對於2021-05-01 ~ 2021-05-29收集的文章,大概可以分成「鄉民以蟑螂統稱網軍」、「鄉民起底帶風向者」、「討論認知作戰」這三種。
五月的社群網路如何分布? 以整個五月的社群文章數來看,「認知作戰」有很穩定的討論量,但短期而言,當「林瑋豐(反串被起底)」發生之後,在短時間內有很熱烈地討論。
在5/24日裡,三種主題的意見領袖有誰?網友的推噓狀態如何? 以「蟑螂網軍」為主的意見領袖帳號是:ahhhhaaaa,該文章有1427則回覆,推文有767,噓文只有14。以「鄉民起底林瑋豐帳號」為主的意見領袖帳號是:golang,在5/24當天甚至有兩篇討論熱度高的文章。以「認知作戰自導自演」為主的意見領是帳號:MapleT回覆推噓皆有。
但實際瀏覽過後可以發現,推文者大多以正面的字詞諷刺網軍帶風向的行為,因此整體來說還是以負面留言和文章居多。
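The opinion leaders above were read off the filtered network plots; as a minimal sketch (assuming the posts_Reviews object built earlier, with one row per comment), a similar ranking can be pulled programmatically:

# Rank posters by comment volume within each topic, tallying pushes and boos
leader_by_topic <- posts_Reviews %>%
  group_by(topic, artPoster) %>%
  summarise(comments = n(),
            pushes = sum(cmtStatus == "推"),
            boos = sum(cmtStatus == "噓"),
            .groups = "drop") %>%
  group_by(topic) %>%
  slice_max(comments, n = 3) # top 3 candidate leaders per topic
leader_by_topic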