With the 2020 presidential election approaching, online and media chatter is heating up. Given how closely the county and city races in the 2018 nine-in-one elections were tied to the internet, we crawl PTT data to analyze each candidate's opinion leaders: how do they drive up discussion volume? Is the sentiment they stir positive or negative? Does it work? How do they differ from one another? From this we try to judge which contenders can claim a place in online attention this round.
Note: besides declared candidates, our selection also includes undeclared but much-discussed figures such as 柯文哲.
library(knitr)
library(dplyr)
library(kableExtra)
knitr::opts_chunk$set(echo = TRUE)
options(knitr.table.format = "html")
library(readr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(stringr)
library(ggplot2)
library(data.table)
library(wordcloud2)
library(stringi)
library(widyr)
library(scales)
library(tm)
library(quanteda)
library(Matrix)
library(slam)
library(Rtsne)
library(randomcoloR)
library(magrittr)
library(tsne)
Data source: PTT discussion articles and their comment threads for the six contenders (2019-01-01 to 2019-05-31).
li = fread("data/賴清德_articleMetaData.csv",encoding="UTF-8")
li_r = fread("data/賴清德_articleMetaData_response.csv",encoding="UTF-8")
tsai = fread("data/蔡英文_articleMetaData.csv",encoding="UTF-8")
tsai_r = fread("data/蔡英文_articleMetaData_response.csv",encoding="UTF-8")
zhu1 = read_csv("data/朱立倫_articleMetaData_response.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentPoster = col_character(),
## commentStatus = col_character(),
## commentDate = col_datetime(format = ""),
## commentContent = col_character()
## )
han1 = read_csv("data/韓國瑜_articleMetaData_reponse.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentPoster = col_character(),
## commentStatus = col_character(),
## commentDate = col_datetime(format = ""),
## commentContent = col_character()
## )
han = read_csv("data/韓國瑜_articleMetaData.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
zhu = read_csv("data/朱立倫_articleMetaData.csv")
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
kp = fread("data/柯文哲_articleMetaData.csv",encoding="UTF-8")
kp_r = fread("data/柯文哲_articleMetaData_response.csv",encoding="UTF-8")
kuo = fread("data/郭台銘_articleMetaData.csv",encoding="UTF-8")
kuo_r = fread("data/郭台銘_articleMetaData_response.csv",encoding="UTF-8")
# Keep only the columns we need
li_r <- li_r[,c(4,7,8,10)]
colnames(li_r) = c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of unique article posters: 1830
length(unique(li$artPoster))
## [1] 1830
# Number of unique commenters: 40551
length(unique(li_r$cmtPoster))
## [1] 40551
# Total number of participants: 41019
allPoster <- c(li$artPoster, li_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 41019
# Collect every user that appears:
# label a user "poster" if they ever posted an article, otherwise "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%li$artPoster, "poster", "replyer"))
# Inner-join comments and articles on artUrl
li_all <- merge(x = li, y = li_r, by = "artUrl")
# Keep three columns: cmtPoster (commenter), artPoster (article poster), artUrl (article link)
link <- li_all %>%
dplyr::select(cmtPoster, artPoster, artUrl)
# graph_from_data_frame requires (for a directed graph) that the first column be "from" and the second "to"; any remaining columns become edge attributes
# Build the network
net <- graph_from_data_frame(d=link, directed=T)
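As a quick sanity check of that from/to convention, here is a minimal sketch on made-up data (toy_link and its contents are hypothetical, not drawn from our dataset):
# Edges point from commenter to poster, so in-degree counts comments received
toy_link <- data.frame(cmtPoster = c("a", "b", "b"),
                       artPoster = c("x", "x", "y"),
                       artUrl = c("u1", "u1", "u2"))
toy_net <- graph_from_data_frame(d = toy_link, directed = TRUE)
degree(toy_net, mode = "in") # x receives 2 comments, y receives 1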
Because so many users comment online, an account only enters the network plot if it posted more than 5 articles, its articles drew at least 500 comments, and its degree in the graph exceeds 5. Filtering on post count keeps posters who follow the candidate closely and share actively; filtering on comment count keeps posters whose articles reliably spark discussion.
The comment threshold keeps the 272 articles with at least 500 comments and drops the other 6,009.
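The degree criterion is applied later, when labeling the plot; as a rough sketch (assuming the net object built in the previous chunk), the same idea could be expressed directly on the graph:
# Keep only vertices whose degree exceeds 5
deg <- degree(net)
net_core <- induced_subgraph(net, V(net)[deg > 5])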
# Filter by comment count and posting frequency
table(li$commentNum>=500)
##
## FALSE TRUE
## 6009 272
li_poster=table(li$artPoster) %>% sort %>% as.data.frame
colnames(li_poster)=c("artPoster","freq")
li_poster=li_poster %>% filter(freq>=5)
link <- li_all %>%
  filter(commentNum >= 500) %>%                   # articles with at least 500 comments
  filter(artPoster %in% li_poster$artPoster) %>%  # posters with at least 5 articles
  # filter(cmtStatus != "→") %>%                  # optionally keep only pushes/boos
  select(cmtPoster, artPoster, artUrl, cmtStatus)
# Keep only users that appear in link (otherwise igraph would also draw vertices with no edges, which is meaningless)
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# # Build the network
# set.seed(487)
# net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# # Only vertices with degree above the threshold get a printed label
# labels <- degree(net)
# V(net)$label <- names(labels)
#
# V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
# # Color each edge by its comment type (push vs. boo)
# E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
#
# plot(net, vertex.size=2, edge.arrow.size=.2,
#      vertex.label=ifelse(degree(net) > 7, V(net)$label, NA), vertex.label.cex=.5)
# # Add legends
# legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
#        col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1,
#        text.width=1, x.intersp=-2, adj=1, y.intersp=1, bty="n")
#
# legend(x=-2, y=0, c("推","噓"),
#        col=c("lightgreen","palevioletred"), lty=1, cex=1,
#        text.width=3, x.intersp=0, adj=2, y.intersp=1, bty="n")
Green-camp supporters tend to back both 蔡英文 and 賴清德 when posting or replying, so the top opinion leaders in 賴清德's network mention him but, on closer inspection, some accounts lean toward 蔡英文 (e.g., Wojnarowski, TWOOOOOOOOOO). We therefore pick out several opinion leaders who more clearly support 賴清德 for comparison.
We select three opinion leaders: shared, youhow0418, luke7212.
Although the network is fairly dispersed, the clusters remain somewhat connected. The accounts above do not necessarily have the most posts or replies, but they are better than most at sparking discussion and steering sentiment.
Next we examine their posting content, sentiment, and frequency.
# Leader 1 (51 articles over these five months)
li_leader1 = li %>% filter(artPoster=="shared")
summary(li_leader1)
## artTitle artDate artTime
## Length:51 Length:51 Length:51
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:51 Length:51 Length:51 Min. : 20.0
## Class :character Class :character Class :character 1st Qu.: 61.5
## Mode :character Mode :character Mode :character Median : 99.0
## Mean : 241.2
## 3rd Qu.: 268.0
## Max. :1479.0
## push boo sentence
## Min. : 4.0 Min. : 4.00 Length:51
## 1st Qu.: 19.0 1st Qu.: 12.50 Class :character
## Median : 37.0 Median : 21.00 Mode :character
## Mean :110.1 Mean : 41.35
## 3rd Qu.:115.0 3rd Qu.: 41.50
## Max. :759.0 Max. :209.00
li_leader1$artDate = as.Date(li_leader1$artDate)
li_leader1= li_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader1time = li_leader1 %>%group_by(months) %>%
summarise(num=n()) %>% as.data.frame %>%
mutate( poster ="shared" )
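# Illustrative aside: cut(artDate, "months") floors each date to the first
# day of its month, which is what groups the articles by month above, e.g.
as.Date(cut(as.Date("2019-03-19"), "months")) # "2019-03-01"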
# Leader 2 (25 articles over these five months)
li_leader2 = li %>% filter(artPoster=="youhow0418")
summary(li_leader2)
## artTitle artDate artTime
## Length:25 Length:25 Length:25
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:25 Length:25 Length:25 Min. : 13.0
## Class :character Class :character Class :character 1st Qu.: 65.0
## Mode :character Mode :character Mode :character Median : 97.0
## Mean : 244.4
## 3rd Qu.: 328.0
## Max. :1474.0
## push boo sentence
## Min. : 6.0 Min. : 0.00 Length:25
## 1st Qu.: 20.0 1st Qu.: 15.00 Class :character
## Median : 26.0 Median : 25.00 Mode :character
## Mean :116.6 Mean : 37.24
## 3rd Qu.:159.0 3rd Qu.: 42.00
## Max. :675.0 Max. :156.00
li_leader2$artDate = as.Date(li_leader2$artDate)
li_leader2= li_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader2time = li_leader2 %>%group_by(months) %>%
summarise(num=n()) %>%
as.data.frame%>%
mutate( poster ="youhow0418" )
# Leader 3 (16 articles over these five months)
li_leader3 = li %>% filter(artPoster=="luke7212")
summary(li_leader3)
## artTitle artDate artTime
## Length:16 Length:16 Length:16
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:16 Length:16 Length:16 Min. : 24.0
## Class :character Class :character Class :character 1st Qu.: 56.5
## Mode :character Mode :character Mode :character Median : 169.0
## Mean : 359.1
## 3rd Qu.: 645.8
## Max. :1056.0
## push boo sentence
## Min. : 11.00 Min. : 1.00 Length:16
## 1st Qu.: 19.75 1st Qu.: 14.75 Class :character
## Median :101.00 Median : 27.50 Mode :character
## Mean :226.81 Mean : 60.75
## 3rd Qu.:412.25 3rd Qu.: 73.25
## Max. :711.00 Max. :335.00
li_leader3$artDate = as.Date(li_leader3$artDate)
li_leader3= li_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader3time = li_leader3 %>%group_by(months) %>%
summarise(num=n()) %>%
as.data.frame%>%
mutate( poster ="luke7212" )
# Combine their monthly posting trends
li_leader = rbind(li_leader1time,li_leader2time,li_leader3time)
li_leader %>% ggplot(aes(x= months,y=num,fill=poster)) +geom_bar(stat = "identity")+
facet_wrap(~poster, ncol = 2, scales = "fixed")
The chart shows all three posters publishing more often as the months pass, concentrating in April and May, presumably because the approaching DPP primary drove up discussion of 賴清德 ahead of the presidential election at the start of next year.
We now move on to content analysis, starting with the three accounts whose posting volume clearly exceeds the rest.
# Leader 1
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader1$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(li_leader1$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
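# Illustrative aside (not part of the pipeline): the pattern above splits on
# both full-width and half-width 。!;? punctuation
strsplit("第一句。第二句!第三句?", "[。!;?!?;]")
# [[1]] "第一句" "第二句" "第三句"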
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 2
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader2$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(li_leader2$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 3
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader3$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(li_leader3$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
Word cloud for leader 1, shared
shared: a [新聞] (news) and [爆卦] (scoop) sharer
Much like Wojnarowski, this account mainly reposts [新聞] stories or [爆卦] posts, centered on polling updates and the state of the presidential race, hence keywords such as 「參選」 (running) and 「初選」 (primary). The content leans toward criticizing 蔡英文 for trailing in the polls and for the DPP's slide in approval early in her term, or citing experts to argue that 賴清德 should be the nominee.
Word cloud for leader 2, youhow0418
youhow0418: a [新聞] (news) sharer
Mostly news reposts, similar in content to shared, but with extra emphasis on boosting 賴清德 and questioning 蔡英文, e.g. 賴清德 leading in the polls, or 蔡英文's handling of the China Airlines strike.
Word cloud for leader 3, luke7212
luke7212: a [臉書] (Facebook) and [新聞] (news) sharer
Posts less often, mainly sharing Facebook posts or news reposts. The pro-賴清德 slant is weaker; the account leans toward supporting both 蔡 and 賴 while affirming 賴清德's record in office.
The analysis above shows that all three accounts mostly repost Facebook posts, news, or polls with strong partisan overtones, and almost never write personal commentary or opinion. We infer that controversial news most readily draws reactions from netizens. Next we analyze the sentiment composition of their posts.
# Load the stop-word dictionary
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]
# Define the negation-word list
negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")
# Remove the negation words from the stop words (the bigram step needs them)
stop_words <- stop_words[!(stop_words %in% negation_words)]
# Load the user dictionary for tokenization
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]
# Load the LIWC sentiment dictionaries (comma-separated word lists)
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(data.frame(word = positive, sentiment = "positive"),
                 data.frame(word = negative, sentiment = "negative"))
# Do not apply the stop-word list here (removing stop words would distort the bigram results)
jieba_tokenizer = worker()
# Re-tokenize with the user dictionary, adding the negation words to it
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer for unnest_tokens
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(unlist(tokens), 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
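# Illustrative aside (actual splits depend on the loaded dictionary): the
# bigram tokenizer returns space-separated token pairs, e.g.
jieba_bigram("賴清德宣布參選總統")
# might yield "賴清德 宣布" "宣布 參選" "參選 總統"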
li_3 = li %>% filter(artPoster=="shared"|artPoster=="youhow0418"|artPoster=="luke7212")
devotion_bigram <- li_3 %>%
unnest_tokens(bigram,sentence, token = jieba_bigram)
# Split each bigram into word1 and word2, and drop bigrams containing letters or digits
bigrams_separated <- devotion_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ")
# Keep bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words) %>%
inner_join(LIWC_ch, by = c(word2 = "word"))
# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and value 1
# a negative word gets the tag "negative" and value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
select(artUrl,artDate,artPoster, word1, word2) %>%
mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))
# Generate every combination of date and sentiment tag over the period
all_dates <-
expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
# Flip the sentiment when word1 is a negation word
devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
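# Worked micro-example of the flip (hypothetical rows; assumes "支持" is a
# positive LIWC word carrying value +1):
toy <- data.frame(word1 = c("非常", "不"), word2 = c("支持", "支持"), sentiment = 1)
toy %>% mutate(sentiment = ifelse(word1 %in% negation_words, -sentiment, sentiment))
# "非常 支持" keeps +1, while "不 支持" flips to -1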
# Compute the daily sentiment counts in our data
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
summarise(count=n())
# Join the full date-sentiment grid with the daily counts;
# dates with no article or sentiment come back as NA,
# which we replace with 0
negated_sentiment_plot_data <- all_dates %>%
merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# Finally, draw the plot
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
mutate(count1 = sentiment.y * count)
negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
ggplot(aes(artDate,count1,fill=sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ artPoster, scales = "fixed") +
scale_x_date(labels = date_format("%m-%d"))
Our three opinion leaders: shared, youhow0418, luke7212
shared's articles are the most combative, with frequent emotionally loaded wording, centered on controversial [新聞] and [爆卦] posts and a large share of polling updates. The high posting volume makes its sentiment densely distributed, and individual articles swing to extremes in both directions.
youhow0418 resembles shared: strongly worded shares, especially attacks on 蔡英文's record; because most posts are controversial news, both positive and negative sentiment run high.
luke7212 posts little, so its sentiment is more scattered; the content is mostly affirmative news or Facebook posts, with occasional negative wording but predominantly positive sentiment.
Checking against the network plot: although shared, youhow0418, and luke7212 post at different rates, their articles are mostly pushed, showing that despite the emotional swings their views enjoy broad support from like-minded users. Compared with lower-volume opinion leaders, however, they also attract more boos.
# Keep only the columns we need
tsai_r <- tsai_r[,c(4,7,8,10)]
colnames(tsai_r) = c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of unique article posters: 1590
length(unique(tsai$artPoster))
## [1] 1590
# Number of unique commenters: 38685
length(unique(tsai_r$cmtPoster))
## [1] 38685
# Total number of participants: 39115
allPoster <- c(tsai$artPoster, tsai_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 39115
# Collect every user that appears:
# label a user "poster" if they ever posted an article, otherwise "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%tsai$artPoster, "poster", "replyer"))
# Inner-join comments and articles on artUrl
tsai_all <- merge(x = tsai, y = tsai_r, by = "artUrl")
# Keep three columns: cmtPoster (commenter), artPoster (article poster), artUrl (article link)
link <- tsai_all %>%
dplyr::select(cmtPoster, artPoster, artUrl)
# graph_from_data_frame requires (for a directed graph) that the first column be "from" and the second "to"; any remaining columns become edge attributes
# Build the network
net <- graph_from_data_frame(d=link, directed=T)
As before, an account only enters the network plot if it posted more than 5 articles, its articles drew at least 500 comments, and its degree exceeds 5.
The comment threshold keeps the 250 articles with at least 500 comments and drops the other 4,546.
# Filter by comment count and posting frequency
table(tsai$commentNum>=500)
##
## FALSE TRUE
## 4546 250
tsai_poster=table(tsai$artPoster) %>% sort %>% as.data.frame
colnames(tsai_poster)=c("artPoster","freq")
tsai_poster=tsai_poster %>% filter(freq>=5)
link <- tsai_all %>%
  filter(commentNum >= 500) %>%                     # articles with at least 500 comments
  filter(artPoster %in% tsai_poster$artPoster) %>%  # posters with at least 5 articles
  # filter(cmtStatus != "→") %>%                    # optionally keep only pushes/boos
  select(cmtPoster, artPoster, artUrl, cmtStatus)
# Keep only users that appear in link (otherwise igraph would also draw vertices with no edges, which is meaningless)
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# # Build the network
# set.seed(487)
# net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# # Only vertices with degree above the threshold get a printed label
# labels <- degree(net)
# V(net)$label <- names(labels)
# V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
#
# # Color each edge by its comment type (push vs. boo)
# E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
#
# plot(net, vertex.size=2, edge.arrow.size=.2,
#      vertex.label=ifelse(degree(net) > 10, V(net)$label, NA), vertex.label.cex=.5)
# # Add legends
# legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
#        col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1, text.width=1, x.intersp=-2, adj=1, y.intersp=1, bty="n")
#
# legend(x=-2, y=0, c("推","噓"),
#        col=c("lightgreen","palevioletred"), lty=1, cex=1,
#        text.width=3, x.intersp=0, adj=2, y.intersp=1, bty="n")
The 「英流」 (蔡英文) community network (split at 500 comments)
The 「英流」 community network (with push/boo coloring)
We pick the top three opinion leaders: Wojnarowski, cheinshin, TWOOOOOOOOOO.
The 蔡英文 and 賴清德 communities have similar shapes, with some overlapping accounts such as Wojnarowski, shared, and TWOOOOOOOOOO; presumably anyone following the DPP shares news about both candidates, just playing different roles in each community.
Next we examine their posting content, sentiment, and frequency.
# Leader 1 (96 articles over these five months)
tsai_leader1 = tsai %>% filter(artPoster=="Wojnarowski")
summary(tsai_leader1)
## artTitle artDate artTime
## Length:96 Length:96 Length:96
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:96 Length:96 Length:96 Min. : 24.0
## Class :character Class :character Class :character 1st Qu.: 225.2
## Mode :character Mode :character Mode :character Median : 346.0
## Mean : 415.7
## 3rd Qu.: 489.5
## Max. :1478.0
## push boo sentence
## Min. : 8.0 Min. : 1.00 Length:96
## 1st Qu.: 152.2 1st Qu.: 29.75 Class :character
## Median : 239.0 Median : 50.50 Mode :character
## Mean : 267.7 Mean : 62.57
## 3rd Qu.: 343.2 3rd Qu.: 72.25
## Max. :1017.0 Max. :446.00
tsai_leader1$artDate = as.Date(tsai_leader1$artDate)
tsai_leader1= tsai_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader1time = tsai_leader1 %>%group_by(months) %>%
summarise(num=n()) %>% as.data.frame %>%
mutate( poster ="Wojnarowski" )
# Leader 2 (44 articles over these five months)
tsai_leader2 = tsai %>% filter(artPoster=="cheinshin")
summary(tsai_leader2)
## artTitle artDate artTime
## Length:44 Length:44 Length:44
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:44 Length:44 Length:44 Min. : 13.0
## Class :character Class :character Class :character 1st Qu.: 71.5
## Mode :character Mode :character Mode :character Median :274.0
## Mean :317.4
## 3rd Qu.:472.8
## Max. :927.0
## push boo sentence
## Min. : 3.00 Min. : 2.00 Length:44
## 1st Qu.: 32.75 1st Qu.: 16.25 Class :character
## Median :198.50 Median : 33.00 Mode :character
## Mean :202.61 Mean : 52.34
## 3rd Qu.:295.50 3rd Qu.: 68.75
## Max. :657.00 Max. :284.00
tsai_leader2$artDate = as.Date(tsai_leader2$artDate)
tsai_leader2= tsai_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader2time = tsai_leader2 %>%group_by(months) %>%
summarise( num=n()) %>%
as.data.frame%>%
mutate( poster ="cheinshin" )
# Leader 3 (27 articles over these five months)
tsai_leader3 = tsai %>% filter(artPoster=="TWOOOOOOOOOO")
summary(tsai_leader3)
## artTitle artDate artTime
## Length:27 Length:27 Length:27
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## artUrl artPoster artCat commentNum
## Length:27 Length:27 Length:27 Min. : 18.0
## Class :character Class :character Class :character 1st Qu.: 38.0
## Mode :character Mode :character Mode :character Median : 81.0
## Mean : 194.8
## 3rd Qu.: 177.5
## Max. :1099.0
## push boo sentence
## Min. : 3.0 Min. : 1.00 Length:27
## 1st Qu.: 14.5 1st Qu.: 10.50 Class :character
## Median : 21.0 Median : 24.00 Mode :character
## Mean :102.1 Mean : 36.48
## 3rd Qu.: 68.5 3rd Qu.: 45.00
## Max. :745.0 Max. :188.00
tsai_leader3$artDate = as.Date(tsai_leader3$artDate)
tsai_leader3= tsai_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader3time = tsai_leader3 %>%group_by(months) %>%
summarise(num=n()) %>%
as.data.frame%>%
mutate( poster ="TWOOOOOOOOOO" )
# Combine their monthly posting trends
tsai_leader = rbind(tsai_leader1time,tsai_leader2time,tsai_leader3time)
tsai_leader %>% ggplot(aes(x= months,y=num,fill=poster)) +geom_bar(stat = "identity")+
facet_wrap(~poster, ncol = 2, scales = "free")
The chart shows that Wojnarowski, cheinshin, and TWOOOOOOOOOO all post noticeably more from February onward, but their output is spread out rather than concentrated in particular months. TWOOOOOOOOOO's frequency climbs month after month, likely because online discussion heats up as the election approaches.
# Leader 1
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader1$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(tsai_leader1$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 2
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader2$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(tsai_leader2$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>2) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 3
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader3$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(tsai_leader3$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>2) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
Word cloud for leader 1, Wojnarowski
Wojnarowski (96): a [新聞] (news) sharer. Mostly news reports or reposts from 蔡英文's Facebook page, with varied content: her foreign policy (opposing one country, two systems; visiting diplomatic allies), publicity for policy achievements, inspection visits such as to the armed forces, criticism of 韓國瑜's policies, and even lighter Facebook reposts such as eating 虎兒油飯 in Chiayi.
Word cloud for leader 2, cheinshin
cheinshin (44): a [臉書] (Facebook) and [新聞] (news) sharer
Mainly Facebook posts and news, most of them from 蔡英文's own page. Beyond the usual Taiwan-US policy and election news, the account also shares local inspection itineraries that rarely make the news, and updates such as warnings against fake messages on Line, acting almost as a mouthpiece for 蔡英文's Facebook.
Word cloud for leader 3, TWOOOOOOOOOO
TWOOOOOOOOOO (27): a [新聞] (news) sharer
Its posts come exclusively from other news outlets, focused on reports of other parties criticizing 蔡英文's policies (foreign policy, the 1992 Consensus, domestic performance), commenting on both sides. We estimate the controversial nature of these reposts is what draws the larger response.
Next we look at the sentiment in these posters' word choices.
# Load the stop-word dictionary
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]
# Define the negation-word list
negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")
# Remove the negation words from the stop words (the bigram step needs them)
stop_words <- stop_words[!(stop_words %in% negation_words)]
# Load the user dictionary for tokenization
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]
# Load the LIWC sentiment dictionaries (comma-separated word lists)
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(data.frame(word = positive, sentiment = "positive"),
                 data.frame(word = negative, sentiment = "negative"))
# Do not apply the stop-word list here (removing stop words would distort the bigram results)
jieba_tokenizer = worker()
# Re-tokenize with the user dictionary, adding the negation words to it
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer for unnest_tokens
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(unlist(tokens), 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
tsai_3 = tsai %>% filter(artPoster=="Wojnarowski"|artPoster=="cheinshin"|artPoster=="TWOOOOOOOOOO")
devotion_bigram <- tsai_3 %>%
unnest_tokens(bigram,sentence, token = jieba_bigram)
# Split each bigram into word1 and word2, and drop bigrams containing letters or digits
bigrams_separated <- devotion_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ")
# Keep bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words) %>%
inner_join(LIWC_ch, by = c(word2 = "word"))
# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and value 1
# a negative word gets the tag "negative" and value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
select(artUrl,artDate,artPoster, word1, word2) %>%
mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))
# Generate every combination of date and sentiment tag over the period
all_dates <-
expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
# Flip the sentiment when word1 is a negation word
devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
# Compute the daily sentiment counts in our data
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
summarise(count=n())
# Join the full date-sentiment grid with the daily counts;
# dates with no article or sentiment come back as NA,
# which we replace with 0
negated_sentiment_plot_data <- all_dates %>%
merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# Finally, draw the plot
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
mutate(count1 = sentiment.y * count)
negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
ggplot(aes(artDate,count1,fill=sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ artPoster, scales = "fixed") +
scale_x_date(labels = date_format("%m-%d"))
Our three opinion leaders:
Wojnarowski posts the most, so its sentiment is the most densely distributed. An ardent 蔡英文 supporter, the account mostly shares her responses to outside criticism, commentary on specific policies, and her foreign-policy stance. Controversial content pushes positive and negative sentiment up together, while positive policy publicity lifts the positive side, for an overall positive-over-negative trend.
cheinshin, acting almost as a mouthpiece for 蔡英文's Facebook, mostly posts positive publicity about policy results or presidential visits, so its sentiment skews positive.
TWOOOOOOOOOO posts less often, so its sentiment is sparser, but because its content is mostly controversial articles, the sentiment distribution is comparatively polarized.
han1 <- han1 %>%
select(artUrl, commentPoster, commentStatus, commentContent)
length(unique(han$artPoster)) # number of unique article posters: 2538
## [1] 2538
length(unique(han1$commentPoster)) # number of unique commenters: 45419
## [1] 45419
allPoster <- c(han$artPoster, han1$commentPoster) # total number of participants: 46078
length(unique(allPoster))
## [1] 46078
# Collect every user that appears:
# label a user "poster" if they ever posted an article,
# otherwise label them "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%han$artPoster, "poster", "replyer"))
# Inner-join comments and articles on artUrl
han_all <- merge(x = han, y = han1, by = "artUrl")
Because so many users comment, an account only enters the network plot if it posted more than 10 articles and its articles drew at least 500 comments.
The comment threshold keeps the 230 articles with at least 500 comments and drops the other 9,543.
# Filter by comment count and posting frequency
table(han$commentNum>=500)
##
## FALSE TRUE
## 9543 230
han_poster=table(han$artPoster) %>% sort %>% as.data.frame
colnames(han_poster)=c("artPoster","freq")
han_poster=han_poster %>% filter(freq>=10)
han_link <- han_all %>%
  filter(commentNum >= 500) %>%                    # articles with at least 500 comments
  filter(artPoster %in% han_poster$artPoster) %>%  # posters with at least 10 articles
  filter(commentStatus != "→") %>%                 # keep only pushes and boos
  select(commentPoster, artPoster, artUrl, commentStatus)
# Keep only users that appear in han_link (otherwise igraph would also draw vertices with no edges, which is meaningless)
han_filtered_user <- userList %>%
filter(user%in%han_link $commentPoster | user%in%han_link $artPoster) %>% arrange(desc(type))
# # Build the network
# set.seed(388)
# net <- graph_from_data_frame(d=han_link, v=han_filtered_user, directed=F)
# # Only vertices with degree above 10 get a printed label
# labels <- degree(net)
# V(net)$label <- names(labels)
#
# V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
# # Color each edge by its comment type (push vs. boo)
# E(net)$color <- ifelse(E(net)$commentStatus == "推", "lightgreen", "palevioletred")
#
# plot(net, vertex.size=2, edge.arrow.size=.2,
#      vertex.label=ifelse(degree(net) > 10, V(net)$label, NA), vertex.label.cex=.5)
# # Add legends
# legend(x=-2, y=-0.2, c("poster","replyer"), pch=21,
#        col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=2,
#        text.width=0.02, x.intersp=0.7, adj=1, y.intersp=0.1, bty="n")
# legend(x=-2, y=1, c("推","噓"),
#        col=c("lightgreen","palevioletred"), lty=1, cex=2,
#        text.width=0.02, x.intersp=0.7, adj=1, y.intersp=0.1, bty="n")
The 韓國瑜 community network
In 韓國瑜's network the clusters are relatively disconnected: links between the many groups, large and small, are weak.
The 韓國瑜 community network (with push/boo coloring)
Filtering for degree(net) above 10 leaves seven article posters; we take the top three by degree as opinion leaders: linhu8883324, Aptantion, shared (a minimal sketch of the ranking follows).
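A sketch of that degree ranking, assuming net was rebuilt from han_link and han_filtered_user as in the commented-out chunk above:
# Rank article posters by degree and take the top three
deg <- degree(net)
head(sort(deg[names(deg) %in% han_all$artPoster], decreasing = TRUE), 3)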
Next we examine their posting content, sentiment, and frequency.
# Leader 1 (72 articles over these five months)
han_leader1 = han %>% filter(artPoster=="linhu8883324")
summary(han_leader1)
## artTitle artDate artTime
## Length:72 Min. :2019-01-05 Length:72
## Class :character 1st Qu.:2019-02-16 Class1:hms
## Mode :character Median :2019-03-19 Class2:difftime
## Mean :2019-03-21 Mode :numeric
## 3rd Qu.:2019-05-02
## Max. :2019-05-30
## artUrl artPoster artCat
## Length:72 Length:72 Length:72
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## commentNum push boo sentence
## Min. : 17.00 Min. : 2.0 Min. : 2.00 Length:72
## 1st Qu.: 64.75 1st Qu.: 18.0 1st Qu.: 10.00 Class :character
## Median : 122.00 Median : 35.0 Median : 21.50 Mode :character
## Mean : 213.28 Mean :114.5 Mean : 36.36
## 3rd Qu.: 255.50 3rd Qu.:148.0 3rd Qu.: 54.25
## Max. :1467.00 Max. :941.0 Max. :203.00
han_leader1$artDate = as.Date(han_leader1$artDate)
han_leader1= han_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
han_leader1time = han_leader1 %>%group_by(months) %>%
summarise(num=n()) %>% as.data.frame %>%
mutate( poster ="linhu8883324" )
# Leader 2 (10 articles over these five months)
han_leader2 = han %>% filter(artPoster=="Aptantion")
summary(han_leader2)
## artTitle artDate artTime
## Length:10 Min. :2019-01-04 Length:10
## Class :character 1st Qu.:2019-02-16 Class1:hms
## Mode :character Median :2019-03-14 Class2:difftime
## Mean :2019-03-14 Mode :numeric
## 3rd Qu.:2019-04-01
## Max. :2019-05-29
## artUrl artPoster artCat commentNum
## Length:10 Length:10 Length:10 Min. : 361.0
## Class :character Class :character Class :character 1st Qu.: 553.5
## Mode :character Mode :character Mode :character Median : 664.0
## Mean : 695.8
## 3rd Qu.: 768.5
## Max. :1216.0
## push boo sentence
## Min. :302.0 Min. : 19.0 Length:10
## 1st Qu.:424.0 1st Qu.: 35.5 Class :character
## Median :473.5 Median : 54.0 Mode :character
## Mean :491.4 Mean : 73.3
## 3rd Qu.:573.5 3rd Qu.: 71.0
## Max. :752.0 Max. :261.0
han_leader2$artDate = as.Date(han_leader2$artDate)
han_leader2= han_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
han_leader2time = han_leader2 %>%group_by(months) %>%
summarise(num=n()) %>% as.data.frame %>%
mutate( poster ="Aptantion" )
# Leader 3 (36 articles over these five months)
han_leader3 = han %>% filter(artPoster=="shared")
summary(han_leader3)
## artTitle artDate artTime
## Length:36 Min. :2019-01-18 Length:36
## Class :character 1st Qu.:2019-02-15 Class1:hms
## Mode :character Median :2019-04-17 Class2:difftime
## Mean :2019-04-03 Mode :numeric
## 3rd Qu.:2019-05-07
## Max. :2019-05-30
## artUrl artPoster artCat commentNum
## Length:36 Length:36 Length:36 Min. : 18.0
## Class :character Class :character Class :character 1st Qu.: 56.0
## Mode :character Mode :character Mode :character Median : 114.0
## Mean : 282.4
## 3rd Qu.: 321.0
## Max. :1479.0
## push boo sentence
## Min. : 5.00 Min. : 1.00 Length:36
## 1st Qu.: 16.75 1st Qu.: 12.00 Class :character
## Median : 50.50 Median : 28.00 Mode :character
## Mean :135.03 Mean : 44.86
## 3rd Qu.:162.25 3rd Qu.: 56.50
## Max. :759.00 Max. :191.00
han_leader3$artDate = as.Date(han_leader3$artDate)
han_leader3= han_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
han_leader3time = han_leader3 %>%group_by(months) %>%
summarise(num=n()) %>% as.data.frame %>%
mutate( poster ="shared" )
# Combine their monthly posting trends
han_leader = rbind(han_leader1time,han_leader2time,han_leader3time)
han_leader %>% ggplot(aes(x= months,y=num,fill=poster)) +geom_bar(stat = "identity")+
facet_wrap(~poster, ncol = 2, scales = "free")
Start with linhu8883324, first of the top three by degree: 72 articles in total, peaking in May. Next is Aptantion: only 10 articles, yet remarkably well connected for so little output, and with an oddly regular two posts per month. Last is shared: 36 articles, rising steadily month by month.
# Leader 1
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(han_leader1$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(han_leader1$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict_2.txt", stop_word = "dict/stop_word_2.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 2
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(han_leader2$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(han_leader2$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict_2.txt", stop_word = "dict/stop_word_2.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
# Leader 3
# Split into sentences on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(han_leader3$sentence,"[。!;?!?;]")
# Pair each sentence with its article URL and collect the result in a data frame
devotion_sentences <- data.frame(
id = rep(han_leader3$artUrl,sapply(devotion_sentences, length)),
sentence = unlist(devotion_sentences)) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)
# Set up the jieba tokenizer with our user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict_2.txt", stop_word = "dict/stop_word_2.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize and count how often each word appears in each article
devotion_words <- devotion_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(id, word, sort = TRUE)
# devotion_words %>%
# group_by(word) %>%
# summarise(sum = n())%>%
# filter(sum>3) %>%
# arrange(desc(sum)) %>% wordcloud2(minSize = 3)
Word cloud for leader 1, linhu8883324
linhu8883324's word cloud is dominated by words like 報導 (report) and 記者 (reporter): the account mainly reposts stories from various news sites, but appends its own note at the end of each post, invariably blasting or mocking 韓國瑜 with words like 唬爛 (bullshitting) and 鬼混 (slacking off), or lines such as 「韓粉不意外」 and 「韓導又在惡搞」, to great response. Averaging 114 pushes and 36 boos per article, its posts are consistently pushed to the top, showing that anti-韓 posts earn plenty of applause on PTT.
Word cloud for leader 2, Aptantion
Aptantion mainly posts exposés about 韓國瑜; the term 韓導 ("Director Han", mocking him for play-acting) takes up much of the word cloud. Averaging 491 pushes and 73 boos per article, the numbers suggest that exposés attacking 韓國瑜 are even more popular with netizens.