Goal: identify each presidential candidate's opinion leaders (network graph), then analyze the top three opinion leaders' traits (e.g., post content), their posting frequency, and word2vec (for a group of cyber-army accounts: the candidate in relation to xx).

Load packages

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(stringr)
library(ggplot2)
library(data.table)
library(wordcloud2)
library(stringi)
library(widyr)
library(scales)
library(tm)
library(quanteda)
library(Matrix)
library(slam)
library(Rtsne)
library(randomcoloR)
library(magrittr)
library(tsne)

Read in the data

Read the PTT article data and the comment (reply) data for the 賴清德 and 蔡英文 discussions (2019.1.1-2019.5.31).

li = fread("data/賴清德_articleMetaData.csv",encoding="UTF-8")
li_r = fread("data/賴清德_articleMetaData_response.csv",encoding="UTF-8")
tsai = fread("data/蔡英文_articleMetaData.csv",encoding="UTF-8")
tsai_r = fread("data/蔡英文_articleMetaData_response.csv",encoding="UTF-8")

Social network analysis of 賴清德's PTT opinion leaders

Finding 賴神 (賴清德)'s PTT opinion leaders

Organize and label the users

# Select the columns we need
li_r <- li_r[,c(4,7,8,10)]
colnames(li_r)=c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1830
length(unique(li$artPoster))
## [1] 1830
# Number of commenters: 40551
length(unique(li_r$cmtPoster))
## [1] 40551
# Total number of participants: 41019
allPoster <- c(li$artPoster, li_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 41019
# Collect every user that appears
# Label a user as "poster" if they ever posted an article, otherwise as "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%li$artPoster, "poster", "replyer"))

Build the social network graph

# Inner-join the comments and articles on artUrl
li_all <- merge(x = li, y = li_r, by = "artUrl")
# Keep the cmtPoster (commenter), artPoster (poster), and artUrl (article URL) columns
link <- li_all %>%
      dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: for a directed graph, graph_from_data_frame treats the
# first column as "from", the second as "to", and any later columns as edge attributes

# Build the network
net <- graph_from_data_frame(d=link, directed=T)
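
As a minimal illustration of that from/to convention, a toy example (made-up users, not from the PTT data):

# Hypothetical commenter -> poster edges; the extra column becomes an edge attribute
toy_link <- data.frame(
  cmtPoster = c("userA", "userB", "userA"),   # from: who commented
  artPoster = c("userC", "userC", "userB"),   # to: whose article they commented on
  cmtStatus = c("推", "噓", "推"),
  stringsAsFactors = FALSE)
toy_net <- graph_from_data_frame(toy_link, directed = TRUE)
E(toy_net)$cmtStatus   # "推" "噓" "推", carried along as an edge attribute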

Because so many users comment online, only posters with at least 5 articles, articles with at least 500 comments, and nodes with community degree greater than 5 are included in the plot.

Narrow the scope

# Filter by comment count and by posting frequency
table(li$commentNum>=500)
## 
## FALSE  TRUE 
##  6009   272
li_poster=table(li$artPoster) %>% sort %>% as.data.frame 
colnames(li_poster)=c("artPoster","freq")
li_poster=li_poster %>% filter(freq>=5) 

link <- li_all %>%
  filter(commentNum >= 500) %>%                    # articles with at least 500 comments
  filter(artPoster %in% li_poster$artPoster) %>%   # posters with at least 5 articles
  filter(cmtStatus != "→") %>%                     # keep only PTT push (推) / boo (噓) comments
      select(cmtPoster, artPoster, artUrl, cmtStatus) 
# Keep only users that actually appear in link
# (otherwise igraph would also draw users with no edges, which is meaningless)
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% 
          arrange(desc(type))

This screens out the 6,009 articles with fewer than 500 comments and keeps the 272 heavily discussed ones.

Draw the network graph

# Build the network
 set.seed(487)
 net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
 # Print a label only for vertices with degree greater than 7
 labels <- degree(net)
 V(net)$label <- names(labels)
 
 V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
 # Color each edge by the comment type: push (推) vs. boo (噓)
 E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
 
 plot(net, vertex.size=2, edge.arrow.size=.2,
       vertex.label=ifelse(degree(net) > 7, V(net)$label, NA),  vertex.label.cex=.5)
  # Add legends
  legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
         col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1,
         text.width=1,x.intersp=-2,adj=1,y.intersp=1,bty="n")

 legend(x=-2, y=0, c("push (推)","boo (噓)"),
         col=c("lightgreen","palevioletred"), lty=1, cex=1,
         text.width=3,x.intersp=0,adj=2,y.intersp=1,bty="n")

(Figure) 賴神 (賴清德)'s social network graph
(Figure) 賴神's social network graph, with push (推) / boo (噓) edges

Because green-camp supporters overlap in whom they post and comment about, several of the top accounts in 賴清德's network graph mention 賴清德 but, on closer comparison, lean toward 蔡英文 (e.g., Wojnarowski and TWOOOOOOOOOO). We therefore pick out a few opinion leaders who clearly lean toward 賴清德 for comparative analysis.

We pick three opinion leaders: shared, youhow0418, and luke7212. Although the community is fairly dispersed, the groups remain noticeably connected. These accounts do not necessarily have the most posts or replies, but compared with other accounts they are better at sparking discussion and steering the conversation.
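
The ranking step itself is not shown above; a minimal sketch of how such leaders can be surfaced from the graph (assuming the net and filtered_user objects from the previous chunk; the same idea applies to the 蔡英文 graph below):

# Rank vertices by degree: accounts touching the most push/boo edges come first
deg <- sort(degree(net), decreasing = TRUE)
head(deg, 10)

# Restrict the ranking to posters, i.e. candidate opinion leaders
poster_names <- filtered_user$user[filtered_user$type == "poster"]
head(deg[names(deg) %in% poster_names], 10)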

Traits of the opinion leaders

e.g., post content, sentiment, and posting frequency

Frequency

# Leader 1: shared (51 posts over these five months)
li_leader1 = li %>% filter(artPoster=="shared")
summary(li_leader1)
##    artTitle           artDate            artTime         
##  Length:51          Length:51          Length:51         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:51          Length:51          Length:51          Min.   :  20.0  
##  Class :character   Class :character   Class :character   1st Qu.:  61.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :  99.0  
##                                                           Mean   : 241.2  
##                                                           3rd Qu.: 268.0  
##                                                           Max.   :1479.0  
##       push            boo           sentence        
##  Min.   :  4.0   Min.   :  4.00   Length:51         
##  1st Qu.: 19.0   1st Qu.: 12.50   Class :character  
##  Median : 37.0   Median : 21.00   Mode  :character  
##  Mean   :110.1   Mean   : 41.35                     
##  3rd Qu.:115.0   3rd Qu.: 41.50                     
##  Max.   :759.0   Max.   :209.00
li_leader1$artDate = as.Date(li_leader1$artDate)
li_leader1= li_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader1time = li_leader1 %>%group_by(months) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="shared" ) 

# Leader 2: youhow0418 (25 posts over these five months)
li_leader2 = li %>% filter(artPoster=="youhow0418")

summary(li_leader2)
##    artTitle           artDate            artTime         
##  Length:25          Length:25          Length:25         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:25          Length:25          Length:25          Min.   :  13.0  
##  Class :character   Class :character   Class :character   1st Qu.:  65.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :  97.0  
##                                                           Mean   : 244.4  
##                                                           3rd Qu.: 328.0  
##                                                           Max.   :1474.0  
##       push            boo           sentence        
##  Min.   :  6.0   Min.   :  0.00   Length:25         
##  1st Qu.: 20.0   1st Qu.: 15.00   Class :character  
##  Median : 26.0   Median : 25.00   Mode  :character  
##  Mean   :116.6   Mean   : 37.24                     
##  3rd Qu.:159.0   3rd Qu.: 42.00                     
##  Max.   :675.0   Max.   :156.00
li_leader2$artDate = as.Date(li_leader2$artDate)
li_leader2= li_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader2time = li_leader2 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="youhow0418" ) 

# Leader 3: luke7212 (16 posts over these five months)
li_leader3 = li %>% filter(artPoster=="luke7212")

summary(li_leader3)
##    artTitle           artDate            artTime         
##  Length:16          Length:16          Length:16         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:16          Length:16          Length:16          Min.   :  24.0  
##  Class :character   Class :character   Class :character   1st Qu.:  56.5  
##  Mode  :character   Mode  :character   Mode  :character   Median : 169.0  
##                                                           Mean   : 359.1  
##                                                           3rd Qu.: 645.8  
##                                                           Max.   :1056.0  
##       push             boo           sentence        
##  Min.   : 11.00   Min.   :  1.00   Length:16         
##  1st Qu.: 19.75   1st Qu.: 14.75   Class :character  
##  Median :101.00   Median : 27.50   Mode  :character  
##  Mean   :226.81   Mean   : 60.75                     
##  3rd Qu.:412.25   3rd Qu.: 73.25                     
##  Max.   :711.00   Max.   :335.00
li_leader3$artDate = as.Date(li_leader3$artDate)
li_leader3= li_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader3time = li_leader3 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="luke7212" ) 

# Combine the three posting trends into one chart
li_leader = rbind(li_leader1time,li_leader2time,li_leader3time)
li_leader %>% ggplot(aes(x= months,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "fixed") 

The chart shows that all three posters post increasingly often over the months, concentrated in April and May. Presumably the approaching DPP primary drove up the discussion heat around 賴清德, ahead of the presidential election in early 2020. Next we move on to content analysis.

Content

We start with a content analysis of the three accounts whose posting volume clearly exceeds the others.

# Leader 1 (shared)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# devotion_words %>%
#   group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>3)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

# Leader 2 (youhow0418)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# devotion_words %>%
#  group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>3)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)


# Leader 3 (luke7212)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
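
The segment-and-count pipeline above is pasted once per leader; a sketch of how it could be factored into a single helper (hypothetical function name, same logic as the chunks above, reusing chi_tokenizer):

# Hypothetical helper: sentence-split, tokenize, and count words per article
count_words_by_article <- function(articles) {
  sentences <- strsplit(articles$sentence, "[。!;?!?;]")
  sentence_df <- data.frame(
    id = rep(articles$artUrl, sapply(sentences, length)),
    sentence = unlist(sentences), stringsAsFactors = FALSE) %>%
    filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
  sentence_df %>%
    unnest_tokens(word, sentence, token = chi_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
    count(id, word, sort = TRUE)
}

# The three chunks above then collapse to one call per leader:
# words_leader1 <- count_words_by_article(li_leader1)
# words_leader2 <- count_words_by_article(li_leader2)
# words_leader3 <- count_words_by_article(li_leader3)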

(Figure) Leader 1 shared's social network graph

(Figure) Leader 2 youhow0418's social network graph

(Figure) Leader 3 luke7212's social network graph

shared (51 posts): a [新聞]/[爆卦] sharer. Similar to Wojnarowski, the posts are mostly [新聞] (news) reposts or [爆卦] (scoop) articles, especially polling updates and the state of the presidential race, hence keywords such as 「參選」 (running) and 「初選」 (primary). The content leans toward criticizing 蔡英文 for trailing in the polls or for dragging down the DPP's support early in her term, or cites expert pronouncements to argue that 賴清德 should run.

youhow0418 (25 posts): a [新聞] sharer. Mostly news reposts, similar in content to shared's, but with more emphasis on boosting 賴清德 and questioning 蔡英文, e.g., 賴清德 leading in the polls, or 蔡英文's handling of the China Airlines strike.

luke7212 (16 posts): a [臉書]/[新聞] sharer. Posts less often, mainly sharing Facebook posts or news reposts. The pro-賴清德 slant is weaker; the account leans toward supporting both 蔡 and 賴 while affirming 賴清德's record in office.

The analysis above shows that all three accounts mostly post Facebook reposts, news reposts, or polling posts with strong partisan overtones, and almost never personal opinion or commentary; we suspect controversial news simply draws more reaction from users. Next we analyze the sentiment composition of their posts.

# Tokenize the full 賴清德 corpus
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)


Sentiment

# Load the stop-word dictionary (as a character vector)
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]

# Define the negation words

negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")

# Remove the negation words from the stop words
stop_words <- stop_words[!(stop_words %in% negation_words)]

# Load the user segmentation dictionary (as a character vector)
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]

# Load the LIWC sentiment dictionaries: character vectors for membership tests,
# plus a combined data frame (LIWC_ch) for the inner_join below
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(
  data.frame(word = positive, sentiment = "positive"),
  data.frame(word = negative, sentiment = "negative")
)


# Do not apply the stop-word dictionary here (removing stop words would break the bigrams)
jieba_tokenizer = worker()

# Re-tokenize with the user dictionary, adding the negation words to the vocabulary
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer function for unnest_tokens
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(unlist(tokens), 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
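
To make the bigram shape concrete before running it on the corpus, a quick check on a made-up sentence (the exact segmentation depends on the loaded dictionaries):

# Each element pairs two adjacent tokens joined by a space,
# so separate() can later split it into word1/word2
jieba_bigram("我不支持這個政策")
# Illustrative output: "我 不" "不 支持" "支持 這個" "這個 政策"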

# Keep only the three opinion leaders' articles
li_3 = li %>% filter(artPoster=="shared"|artPoster=="youhow0418"|artPoster=="luke7212")
devotion_bigram <- li_3 %>%
  unnest_tokens(bigram,sentence, token = jieba_bigram)

# Split each bigram into word1 and word2, dropping bigrams that contain letters or digits
bigrams_separated <- devotion_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep the bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
  filter(!word1 %in% stop_words) %>%
  filter(!word2 %in% stop_words) %>%
  inner_join(LIWC_ch, by = c(word2 = "word"))   

# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and sentiment value  1
# a negative word gets the tag "negative" and sentiment value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
  select(artUrl,artDate,artPoster, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))

# Generate every combination of date and sentiment tag over the time span
all_dates <- 
  expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# Flip the sentiment wherever a negation word precedes the sentiment word

devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# Compute the daily sentiment counts in our data set
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
  group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# Join the grid of all date/sentiment combinations with the daily sentiment counts
# Dates with no article or sentiment value come out as NA
# Replace those NAs with 0
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# Finally, draw the chart
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)

negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
  ggplot(aes(artDate,count1,fill=sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ artPoster, scales = "fixed") +
  scale_x_date(labels = date_format("%m-%d"))
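
As a worked example of the negation flip (a toy bigram; assumes 滿意 appears in the positive LIWC vector and 不 is among negation_words):

# "不 滿意" ("not satisfied"): word2 = 滿意 scores +1 as a positive word,
# but word1 = 不 is a negation word, so the score flips to -1 ("negative")
w1 <- "不"; w2 <- "滿意"
s <- ifelse(w2 %in% positive, 1, -1)            # +1
s <- ifelse(w1 %in% negation_words, -1 * s, s)  # flipped to -1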

The three opinion leaders: shared, youhow0418, luke7212

shared's articles are the most combative, with frequent emotionally charged wording, centered on controversial [新聞] and [爆卦] posts and a large share of polling updates. Its high posting volume makes the sentiment densely distributed, and individual posts swing to both positive and negative extremes.

youhow0418 is similar to shared: the shared content carries strong sentiment, especially when attacking 蔡英文's governance, and because most posts are controversial news, both positive and negative sentiment run high.

luke7212 posts little, so the sentiment is more scattered; the content is mostly affirmative news or Facebook posts, with occasional negative wording but predominantly positive sentiment.

Checked against the network graph, shared, youhow0418, and luke7212 differ in posting frequency but draw mostly push (推) comments, which suggests that despite the emotional swings their views enjoy broad support from like-minded users. Compared with lower-volume opinion leaders, however, they also attract more boo (噓) comments.

Social network analysis of 蔡英文's PTT opinion leaders

Finding 蔡英文's PTT opinion leaders

Organize and label the users

# Select the columns we need
tsai_r <- tsai_r[,c(4,7,8,10)]
colnames(tsai_r)=c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1590
length(unique(tsai$artPoster))
## [1] 1590
# Number of commenters: 38685
length(unique(tsai_r$cmtPoster))
## [1] 38685
# Total number of participants: 39115
allPoster <- c(tsai$artPoster, tsai_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 39115
# Collect every user that appears
# Label a user as "poster" if they ever posted an article, otherwise as "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%tsai$artPoster, "poster", "replyer"))

Build the social network graph

# Inner-join the comments and articles on artUrl
tsai_all <- merge(x = tsai, y = tsai_r, by = "artUrl")

# Keep the cmtPoster (commenter), artPoster (poster), and artUrl (article URL) columns
link <- tsai_all %>%
      dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: for a directed graph, graph_from_data_frame treats the
# first column as "from", the second as "to", and any later columns as edge attributes

# Build the network
net <- graph_from_data_frame(d=link, directed=T)

As before, because so many users comment online, only posters with at least 5 articles, articles with at least 500 comments, and nodes with community degree greater than 5 are included in the plot.

Narrow the scope

# Filter by comment count and by posting frequency
# (same thresholds as for 賴清德: at least 500 comments, at least 5 posts)
table(tsai$commentNum>=500)
## 
## FALSE  TRUE 
##  4546   250
tsai_poster=table(tsai$artPoster) %>% sort %>% as.data.frame 
colnames(tsai_poster)=c("artPoster","freq")
tsai_poster=tsai_poster %>% filter(freq>=5) 

link <- tsai_all %>%
  filter(commentNum >= 500) %>%                      # articles with at least 500 comments
  filter(artPoster %in% tsai_poster$artPoster) %>%   # posters with at least 5 articles
  filter(cmtStatus != "→") %>%                       # keep only PTT push (推) / boo (噓) comments
      select(cmtPoster, artPoster, artUrl, cmtStatus) 
# Keep only users that actually appear in link
# (otherwise igraph would also draw users with no edges, which is meaningless)
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% 
          arrange(desc(type))

This screens out the 4,546 articles with fewer than 500 comments and keeps the 250 heavily discussed ones.

Draw the network graph

 # Build the network
 set.seed(487)
  net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
  # Print a label only for vertices with degree greater than 10
  labels <- degree(net)
  V(net)$label <- names(labels)
  
  V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
  # Color each edge by the comment type: push (推) vs. boo (噓)
  #E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
#   
#   plot(net, vertex.size=2, edge.arrow.size=.2,
#        vertex.label=ifelse(degree(net) > 10, V(net)$label, NA),  vertex.label.cex=.5)
#  # # Add legends
#   legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
#          col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1,cex=1,text.width=1,x.intersp=-2,adj=1,y.intersp=1,bty="n")
# 
# legend(x=-2, y=0, c("push (推)","boo (噓)"), 
#         col=c("lightgreen","palevioletred"), lty=1, cex=1,
#         text.width=3,x.intersp=0,adj=2,y.intersp=1,bty="n")

(Figure) 蔡英文's ('英流') social network graph (articles with at least 500 comments)
(Figure) 蔡英文's social network graph, with push (推) / boo (噓) edges

We pick out the top three opinion leaders: Wojnarowski, cheinshin, and TWOOOOOOOOOO. The shape of 蔡英文's community resembles 賴清德's, and some accounts appear in both graphs (e.g., Wojnarowski, shared, TWOOOOOOOOOO). Presumably users who follow the DPP post updates about both candidates; the same accounts simply play different roles in the two communities.

Traits of the opinion leaders

e.g., post content, sentiment, and posting frequency

Frequency

# Leader 1: Wojnarowski (96 posts over these five months)
tsai_leader1 = tsai %>% filter(artPoster=="Wojnarowski")
summary(tsai_leader1)
##    artTitle           artDate            artTime         
##  Length:96          Length:96          Length:96         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:96          Length:96          Length:96          Min.   :  24.0  
##  Class :character   Class :character   Class :character   1st Qu.: 225.2  
##  Mode  :character   Mode  :character   Mode  :character   Median : 346.0  
##                                                           Mean   : 415.7  
##                                                           3rd Qu.: 489.5  
##                                                           Max.   :1478.0  
##       push             boo           sentence        
##  Min.   :   8.0   Min.   :  1.00   Length:96         
##  1st Qu.: 152.2   1st Qu.: 29.75   Class :character  
##  Median : 239.0   Median : 50.50   Mode  :character  
##  Mean   : 267.7   Mean   : 62.57                     
##  3rd Qu.: 343.2   3rd Qu.: 72.25                     
##  Max.   :1017.0   Max.   :446.00
tsai_leader1$artDate = as.Date(tsai_leader1$artDate)
tsai_leader1= tsai_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader1time = tsai_leader1 %>%group_by(months) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="Wojnarowski" ) 


# Leader 2: cheinshin (44 posts over these five months)
tsai_leader2 = tsai %>% filter(artPoster=="cheinshin")
summary(tsai_leader2)
##    artTitle           artDate            artTime         
##  Length:44          Length:44          Length:44         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum   
##  Length:44          Length:44          Length:44          Min.   : 13.0  
##  Class :character   Class :character   Class :character   1st Qu.: 71.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :274.0  
##                                                           Mean   :317.4  
##                                                           3rd Qu.:472.8  
##                                                           Max.   :927.0  
##       push             boo           sentence        
##  Min.   :  3.00   Min.   :  2.00   Length:44         
##  1st Qu.: 32.75   1st Qu.: 16.25   Class :character  
##  Median :198.50   Median : 33.00   Mode  :character  
##  Mean   :202.61   Mean   : 52.34                     
##  3rd Qu.:295.50   3rd Qu.: 68.75                     
##  Max.   :657.00   Max.   :284.00
tsai_leader2$artDate = as.Date(tsai_leader2$artDate)
tsai_leader2= tsai_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader2time = tsai_leader2 %>%group_by(months) %>%
  summarise( num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="cheinshin" ) 

# Leader 3: TWOOOOOOOOOO (27 posts over these five months)
tsai_leader3 = tsai %>% filter(artPoster=="TWOOOOOOOOOO")
summary(tsai_leader3)
##    artTitle           artDate            artTime         
##  Length:27          Length:27          Length:27         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:27          Length:27          Length:27          Min.   :  18.0  
##  Class :character   Class :character   Class :character   1st Qu.:  38.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :  81.0  
##                                                           Mean   : 194.8  
##                                                           3rd Qu.: 177.5  
##                                                           Max.   :1099.0  
##       push            boo           sentence        
##  Min.   :  3.0   Min.   :  1.00   Length:27         
##  1st Qu.: 14.5   1st Qu.: 10.50   Class :character  
##  Median : 21.0   Median : 24.00   Mode  :character  
##  Mean   :102.1   Mean   : 36.48                     
##  3rd Qu.: 68.5   3rd Qu.: 45.00                     
##  Max.   :745.0   Max.   :188.00
tsai_leader3$artDate = as.Date(tsai_leader3$artDate)
tsai_leader3= tsai_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader3time = tsai_leader3 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="TWOOOOOOOOOO" ) 

# Combine the three posting trends into one chart
tsai_leader = rbind(tsai_leader1time,tsai_leader2time,tsai_leader3time)

tsai_leader %>% ggplot(aes(x= months,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

The chart shows that Wojnarowski, cheinshin, and TWOOOOOOOOOO all post noticeably more from February onward, but their posting is fairly spread out, without a strong concentration in particular months. TWOOOOOOOOOO's posting frequency keeps climbing month by month, probably because online discussion heats up as the election approaches.

Content

# Leader 1 (Wojnarowski)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
 # devotion_words %>%
 #   group_by(word) %>%
 #   summarise(sum = n())%>%
 #   filter(sum>3)  %>%
 #   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

# Leader 2 (cheinshin)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
 # devotion_words %>%
 #   group_by(word) %>%
 #   summarise(sum = n())%>%
 #   filter(sum>2)  %>%
 #   arrange(desc(sum)) %>% wordcloud2(minSize = 3)


# Leader 3 (TWOOOOOOOOOO)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)

# devotion_words %>%
#   group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>2)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

(Figure) Leader 1 Wojnarowski's social network graph
(Figure) Leader 2 cheinshin's social network graph
(Figure) Leader 3 TWOOOOOOOOOO's social network graph

Wojnarowski (96 posts): a [新聞] sharer. Mostly news reports or reposts of 蔡英文's Facebook posts, covering a wide range of content: her foreign policy (opposition to "one country, two systems", exchanges with diplomatic allies), publicity for policy achievements, inspection trips such as visits to the armed forces, criticism of 韓國瑜's policies, and even lighter Facebook reposts such as eating 虎兒油飯 in Chiayi.

cheinshin (44 posts): a [臉書]/[新聞] sharer. Mainly Facebook posts and news, most of it from 蔡英文's own Facebook page. Besides the usual Taiwan-US policy and election news, the account also shares less newsworthy items from the president's schedule, such as local inspection visits, or posts reminding people not to trust fake news on LINE; it reads almost like a mouthpiece for 蔡英文's Facebook page.

TWOOOOOOOOOO (27 posts): a [新聞] sharer. The posts are uniformly from other news media and focus on other parties' criticism of 蔡英文's policies, such as foreign policy, the 1992 Consensus, and domestic governance, with commentary on both sides. Presumably the controversial nature of these reposts is what draws the larger response.

Next we dig into the sentiment of these posters' word choices.

Sentiment

# Load the stop-word dictionary (as a character vector)
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]

# Define the negation words

negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")

# Remove the negation words from the stop words
stop_words <- stop_words[!(stop_words %in% negation_words)]

# Load the user segmentation dictionary (as a character vector)
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]

# Load the LIWC sentiment dictionaries: character vectors for membership tests,
# plus a combined data frame (LIWC_ch) for the inner_join below
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(
  data.frame(word = positive, sentiment = "positive"),
  data.frame(word = negative, sentiment = "negative")
)


# Do not apply the stop-word dictionary here (removing stop words would break the bigrams)
jieba_tokenizer = worker()

# Re-tokenize with the user dictionary, adding the negation words to the vocabulary
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer function for unnest_tokens
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(unlist(tokens), 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

# Keep only the three opinion leaders' articles
tsai_3 = tsai %>% filter(artPoster=="Wojnarowski"|artPoster=="cheinshin"|artPoster=="TWOOOOOOOOOO")
devotion_bigram <- tsai_3 %>%
  unnest_tokens(bigram,sentence, token = jieba_bigram)

# Split each bigram into word1 and word2, dropping bigrams that contain letters or digits
bigrams_separated <- devotion_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep the bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
  filter(!word1 %in% stop_words) %>%
  filter(!word2 %in% stop_words) %>%
  inner_join(LIWC_ch, by = c(word2 = "word"))   

# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and sentiment value  1
# a negative word gets the tag "negative" and sentiment value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
  select(artUrl,artDate,artPoster, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))

# Generate every combination of date and sentiment tag over the time span
all_dates <- 
  expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# Flip the sentiment wherever a negation word precedes the sentiment word

devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# Compute the daily sentiment counts in our data set
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
  group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# Join the grid of all date/sentiment combinations with the daily sentiment counts
# Dates with no article or sentiment value come out as NA
# Replace those NAs with 0
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# Finally, draw the chart
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)

negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
  ggplot(aes(artDate,count1,fill=sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ artPoster, scales = "fixed") +
  scale_x_date(labels = date_format("%m-%d"))

The three opinion leaders:

Wojnarowski posts the most, so its sentiment is the most densely distributed. An ardent 蔡英文 supporter, the account mostly shares her responses to outside criticism, defenses or explanations of specific policies, and statements of foreign-policy direction. The controversial content pushes positive and negative sentiment up together, while upbeat policy publicity lifts the positive side, for an overall positive-over-negative trend.

cheinshin, acting almost as a mouthpiece for 蔡英文's Facebook page, mostly posts positive publicity about her record or the president's itinerary, so positive sentiment dominates. TWOOOOOOOOOO posts less often, so its sentiment is sparser, but because most of its content is controversial, its sentiment distribution is comparatively polarized.
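
To back these qualitative reads with numbers, one could tabulate each leader's share of positive versus negative bigrams (a sketch, assuming the devotion_sentiment_bigrams_negated object from the previous chunk):

# Share of positive vs. negative (negation-adjusted) bigrams per opinion leader
devotion_sentiment_bigrams_negated %>%
  count(artPoster, sentiment_tag) %>%
  group_by(artPoster) %>%
  mutate(share = n / sum(n)) %>%
  arrange(artPoster, desc(share))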