Goal: identify each presidential candidate's opinion leaders (network graph), then analyze the top three opinion leaders' traits (e.g., post content), their posting frequency, and word2vec (for a group of cyber-army accounts: the candidate in relation to xx).

Load packages

library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(stringr)
library(ggplot2)
library(data.table)
library(wordcloud2)
library(stringi)
library(widyr)
library(scales)
library(tm)
library(quanteda)
library(Matrix)
library(slam)
library(Rtsne)
library(randomcoloR)
library(magrittr)
library(tsne)

Read in the data

Read the PTT article data and the comment (reply) data for the 賴清德 and 蔡英文 discussions (2019.1.1-2019.5.31).

li = fread("data/賴清德_articleMetaData.csv",encoding="UTF-8")
li_r = fread("data/賴清德_articleMetaData_response.csv",encoding="UTF-8")
tsai = fread("data/蔡英文_articleMetaData.csv",encoding="UTF-8")
tsai_r = fread("data/蔡英文_articleMetaData_response.csv",encoding="UTF-8")

Social network analysis of 賴清德's PTT opinion leaders

Finding 賴神 (賴清德)'s PTT opinion leaders

Organize and label the users

# Select the columns we need
li_r <- li_r[,c(4,7,8,10)]
colnames(li_r)=c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1830
length(unique(li$artPoster))
## [1] 1830
# Number of commenters: 40551
length(unique(li_r$cmtPoster))
## [1] 40551
# Total number of participants: 41019
allPoster <- c(li$artPoster, li_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 41019
# Collect every user that appears
# Label a user as "poster" if they ever posted an article, otherwise as "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%li$artPoster, "poster", "replyer"))

Build the social network graph

# Inner-join the comments and articles on artUrl
li_all <- merge(x = li, y = li_r, by = "artUrl")
# Keep the cmtPoster (commenter), artPoster (poster), and artUrl (article URL) columns
link <- li_all %>%
      dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: for a directed graph, graph_from_data_frame treats the
# first column as "from", the second as "to", and any later columns as edge attributes

# Build the network
net <- graph_from_data_frame(d=link, directed=T)
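
As a minimal illustration of that from/to convention, a toy example (made-up users, not from the PTT data):

# Hypothetical commenter -> poster edges; the extra column becomes an edge attribute
toy_link <- data.frame(
  cmtPoster = c("userA", "userB", "userA"),   # from: who commented
  artPoster = c("userC", "userC", "userB"),   # to: whose article they commented on
  cmtStatus = c("推", "噓", "推"),
  stringsAsFactors = FALSE)
toy_net <- graph_from_data_frame(toy_link, directed = TRUE)
E(toy_net)$cmtStatus   # "推" "噓" "推", carried along as an edge attribute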

Because so many users comment online, only posters with at least 5 articles, articles with at least 500 comments, and nodes with community degree greater than 5 are included in the plot.

Narrow the scope

# Filter by comment count and by posting frequency
table(li$commentNum>=500)
## 
## FALSE  TRUE 
##  6009   272
li_poster=table(li$artPoster) %>% sort %>% as.data.frame 
colnames(li_poster)=c("artPoster","freq")
li_poster=li_poster %>% filter(freq>=5) 

link <- li_all %>%
  filter(commentNum >= 500) %>%                    # articles with at least 500 comments
  filter(artPoster %in% li_poster$artPoster) %>%   # posters with at least 5 articles
  filter(cmtStatus != "→") %>%                     # keep only PTT push (推) / boo (噓) comments
      select(cmtPoster, artPoster, artUrl, cmtStatus) 
# Keep only users that actually appear in link
# (otherwise igraph would also draw users with no edges, which is meaningless)
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% 
          arrange(desc(type))

This screens out the 6,009 articles with fewer than 500 comments and keeps the 272 heavily discussed ones.

Draw the network graph

# Build the network
 set.seed(487)
 net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
 # Print a label only for vertices with degree greater than 7
 labels <- degree(net)
 V(net)$label <- names(labels)
 
 V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
 # Color each edge by the comment type: push (推) vs. boo (噓)
 E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
 
 plot(net, vertex.size=2, edge.arrow.size=.2,
       vertex.label=ifelse(degree(net) > 7, V(net)$label, NA),  vertex.label.cex=.5)
  # Add legends
  legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
         col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1,
         text.width=1,x.intersp=-2,adj=1,y.intersp=1,bty="n")

 legend(x=-2, y=0, c("push (推)","boo (噓)"),
         col=c("lightgreen","palevioletred"), lty=1, cex=1,
         text.width=3,x.intersp=0,adj=2,y.intersp=1,bty="n")

(Figure) 賴神 (賴清德)'s social network graph
(Figure) 賴神's social network graph, with push (推) / boo (噓) edges

Because green-camp supporters overlap in whom they post and comment about, several of the top accounts in 賴清德's network graph mention 賴清德 but, on closer comparison, lean toward 蔡英文 (e.g., Wojnarowski and TWOOOOOOOOOO). We therefore pick out a few opinion leaders who clearly lean toward 賴清德 for comparative analysis.

We pick three opinion leaders: shared, youhow0418, and luke7212. Although the community is fairly dispersed, the groups remain noticeably connected. These accounts do not necessarily have the most posts or replies, but compared with other accounts they are better at sparking discussion and steering the conversation.
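
The ranking step itself is not shown above; a minimal sketch of how such leaders can be surfaced from the graph (assuming the net and filtered_user objects from the previous chunk; the same idea applies to the 蔡英文 graph below):

# Rank vertices by degree: accounts touching the most push/boo edges come first
deg <- sort(degree(net), decreasing = TRUE)
head(deg, 10)

# Restrict the ranking to posters, i.e. candidate opinion leaders
poster_names <- filtered_user$user[filtered_user$type == "poster"]
head(deg[names(deg) %in% poster_names], 10)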

Traits of the opinion leaders

e.g., post content, sentiment, and posting frequency

Frequency

# Leader 1: shared (51 posts over these five months)
li_leader1 = li %>% filter(artPoster=="shared")
summary(li_leader1)
##    artTitle           artDate            artTime         
##  Length:51          Length:51          Length:51         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:51          Length:51          Length:51          Min.   :  20.0  
##  Class :character   Class :character   Class :character   1st Qu.:  61.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :  99.0  
##                                                           Mean   : 241.2  
##                                                           3rd Qu.: 268.0  
##                                                           Max.   :1479.0  
##       push            boo           sentence        
##  Min.   :  4.0   Min.   :  4.00   Length:51         
##  1st Qu.: 19.0   1st Qu.: 12.50   Class :character  
##  Median : 37.0   Median : 21.00   Mode  :character  
##  Mean   :110.1   Mean   : 41.35                     
##  3rd Qu.:115.0   3rd Qu.: 41.50                     
##  Max.   :759.0   Max.   :209.00
li_leader1$artDate = as.Date(li_leader1$artDate)
li_leader1= li_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader1time = li_leader1 %>%group_by(months) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="shared" ) 

# Leader 2: youhow0418 (25 posts over these five months)
li_leader2 = li %>% filter(artPoster=="youhow0418")

summary(li_leader2)
##    artTitle           artDate            artTime         
##  Length:25          Length:25          Length:25         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:25          Length:25          Length:25          Min.   :  13.0  
##  Class :character   Class :character   Class :character   1st Qu.:  65.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :  97.0  
##                                                           Mean   : 244.4  
##                                                           3rd Qu.: 328.0  
##                                                           Max.   :1474.0  
##       push            boo           sentence        
##  Min.   :  6.0   Min.   :  0.00   Length:25         
##  1st Qu.: 20.0   1st Qu.: 15.00   Class :character  
##  Median : 26.0   Median : 25.00   Mode  :character  
##  Mean   :116.6   Mean   : 37.24                     
##  3rd Qu.:159.0   3rd Qu.: 42.00                     
##  Max.   :675.0   Max.   :156.00
li_leader2$artDate = as.Date(li_leader2$artDate)
li_leader2= li_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader2time = li_leader2 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="youhow0418" ) 

# Leader 3: luke7212 (16 posts over these five months)
li_leader3 = li %>% filter(artPoster=="luke7212")

summary(li_leader3)
##    artTitle           artDate            artTime         
##  Length:16          Length:16          Length:16         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:16          Length:16          Length:16          Min.   :  24.0  
##  Class :character   Class :character   Class :character   1st Qu.:  56.5  
##  Mode  :character   Mode  :character   Mode  :character   Median : 169.0  
##                                                           Mean   : 359.1  
##                                                           3rd Qu.: 645.8  
##                                                           Max.   :1056.0  
##       push             boo           sentence        
##  Min.   : 11.00   Min.   :  1.00   Length:16         
##  1st Qu.: 19.75   1st Qu.: 14.75   Class :character  
##  Median :101.00   Median : 27.50   Mode  :character  
##  Mean   :226.81   Mean   : 60.75                     
##  3rd Qu.:412.25   3rd Qu.: 73.25                     
##  Max.   :711.00   Max.   :335.00
li_leader3$artDate = as.Date(li_leader3$artDate)
li_leader3= li_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
li_leader3time = li_leader3 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="luke7212" ) 

# Combine the three posting trends into one chart
li_leader = rbind(li_leader1time,li_leader2time,li_leader3time)
li_leader %>% ggplot(aes(x= months,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "fixed") 

The chart shows that all three posters post increasingly often over the months, concentrated in April and May. Presumably the approaching DPP primary drove up the discussion heat around 賴清德, ahead of the presidential election in early 2020. Next we move on to content analysis.

Content

We start with a content analysis of the three accounts whose posting volume clearly exceeds the others.

# Leader 1 (shared)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# devotion_words %>%
#   group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>3)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

# Leader 2 (youhow0418)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# devotion_words %>%
#  group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>3)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)


# Leader 3 (luke7212)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
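
The segment-and-count pipeline above is pasted once per leader; a sketch of how it could be factored into a single helper (hypothetical function name, same logic as the chunks above, reusing chi_tokenizer):

# Hypothetical helper: sentence-split, tokenize, and count words per article
count_words_by_article <- function(articles) {
  sentences <- strsplit(articles$sentence, "[。!;?!?;]")
  sentence_df <- data.frame(
    id = rep(articles$artUrl, sapply(sentences, length)),
    sentence = unlist(sentences), stringsAsFactors = FALSE) %>%
    filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
  sentence_df %>%
    unnest_tokens(word, sentence, token = chi_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
    count(id, word, sort = TRUE)
}

# The three chunks above then collapse to one call per leader:
# words_leader1 <- count_words_by_article(li_leader1)
# words_leader2 <- count_words_by_article(li_leader2)
# words_leader3 <- count_words_by_article(li_leader3)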

(Figure) Leader 1 shared's social network graph

(Figure) Leader 2 youhow0418's social network graph

(Figure) Leader 3 luke7212's social network graph

shared (51 posts): a [新聞]/[爆卦] sharer. Similar to Wojnarowski, the posts are mostly [新聞] (news) reposts or [爆卦] (scoop) articles, especially polling updates and the state of the presidential race, hence keywords such as 「參選」 (running) and 「初選」 (primary). The content leans toward criticizing 蔡英文 for trailing in the polls or for dragging down the DPP's support early in her term, or cites expert pronouncements to argue that 賴清德 should run.

youhow0418 (25 posts): a [新聞] sharer. Mostly news reposts, similar in content to shared's, but with more emphasis on boosting 賴清德 and questioning 蔡英文, e.g., 賴清德 leading in the polls, or 蔡英文's handling of the China Airlines strike.

luke7212 (16 posts): a [臉書]/[新聞] sharer. Posts less often, mainly sharing Facebook posts or news reposts. The pro-賴清德 slant is weaker; the account leans toward supporting both 蔡 and 賴 while affirming 賴清德's record in office.

The analysis above shows that all three accounts mostly post Facebook reposts, news reposts, or polling posts with strong partisan overtones, and almost never personal opinion or commentary; we suspect controversial news simply draws more reaction from users. Next we analyze the sentiment composition of their posts.

# Tokenize the full 賴清德 corpus
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(li$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(li$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)


Sentiment

# Load the stop-word dictionary (as a character vector)
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]

# Define the negation words

negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")

# Remove the negation words from the stop words
stop_words <- stop_words[!(stop_words %in% negation_words)]

# Load the user segmentation dictionary (as a character vector)
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]

# Load the LIWC sentiment dictionaries: character vectors for membership tests,
# plus a combined data frame (LIWC_ch) for the inner_join below
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(
  data.frame(word = positive, sentiment = "positive"),
  data.frame(word = negative, sentiment = "negative")
)


# Do not apply the stop-word dictionary here (removing stop words would break the bigrams)
jieba_tokenizer = worker()

# Re-tokenize with the user dictionary, adding the negation words to the vocabulary
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer function for unnest_tokens
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(unlist(tokens), 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
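
To make the bigram shape concrete before running it on the corpus, a quick check on a made-up sentence (the exact segmentation depends on the loaded dictionaries):

# Each element pairs two adjacent tokens joined by a space,
# so separate() can later split it into word1/word2
jieba_bigram("我不支持這個政策")
# Illustrative output: "我 不" "不 支持" "支持 這個" "這個 政策"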

# Keep only the three opinion leaders' articles
li_3 = li %>% filter(artPoster=="shared"|artPoster=="youhow0418"|artPoster=="luke7212")
devotion_bigram <- li_3 %>%
  unnest_tokens(bigram,sentence, token = jieba_bigram)

# Split each bigram into word1 and word2, dropping bigrams that contain letters or digits
bigrams_separated <- devotion_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep the bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
  filter(!word1 %in% stop_words) %>%
  filter(!word2 %in% stop_words) %>%
  inner_join(LIWC_ch, by = c(word2 = "word"))   

# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and sentiment value  1
# a negative word gets the tag "negative" and sentiment value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
  select(artUrl,artDate,artPoster, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))

# Generate every combination of date and sentiment tag over the time span
all_dates <- 
  expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# Flip the sentiment wherever a negation word precedes the sentiment word

devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# Compute the daily sentiment counts in our data set
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
  group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# Join the grid of all date/sentiment combinations with the daily sentiment counts
# Dates with no article or sentiment value come out as NA
# Replace those NAs with 0
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# Finally, draw the chart
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)

negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
  ggplot(aes(artDate,count1,fill=sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ artPoster, scales = "fixed") +
  scale_x_date(labels = date_format("%m-%d"))
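
As a worked example of the negation flip (a toy bigram; assumes 滿意 appears in the positive LIWC vector and 不 is among negation_words):

# "不 滿意" ("not satisfied"): word2 = 滿意 scores +1 as a positive word,
# but word1 = 不 is a negation word, so the score flips to -1 ("negative")
w1 <- "不"; w2 <- "滿意"
s <- ifelse(w2 %in% positive, 1, -1)            # +1
s <- ifelse(w1 %in% negation_words, -1 * s, s)  # flipped to -1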

The three opinion leaders: shared, youhow0418, luke7212

shared's articles are the most combative, with frequent emotionally charged wording, centered on controversial [新聞] and [爆卦] posts and a large share of polling updates. Its high posting volume makes the sentiment densely distributed, and individual posts swing to both positive and negative extremes.

youhow0418 is similar to shared: the shared content carries strong sentiment, especially when attacking 蔡英文's governance, and because most posts are controversial news, both positive and negative sentiment run high.

luke7212 posts little, so the sentiment is more scattered; the content is mostly affirmative news or Facebook posts, with occasional negative wording but predominantly positive sentiment.

Checked against the network graph, shared, youhow0418, and luke7212 differ in posting frequency but draw mostly push (推) comments, which suggests that despite the emotional swings their views enjoy broad support from like-minded users. Compared with lower-volume opinion leaders, however, they also attract more boo (噓) comments.

Social network analysis of 蔡英文's PTT opinion leaders

Finding 蔡英文's PTT opinion leaders

Organize and label the users

# Select the columns we need
tsai_r <- tsai_r[,c(4,7,8,10)]
colnames(tsai_r)=c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1590
length(unique(tsai$artPoster))
## [1] 1590
# Number of commenters: 38685
length(unique(tsai_r$cmtPoster))
## [1] 38685
# Total number of participants: 39115
allPoster <- c(tsai$artPoster, tsai_r$cmtPoster)
length(unique(allPoster)) # deduplicated
## [1] 39115
# Collect every user that appears
# Label a user as "poster" if they ever posted an article, otherwise as "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%tsai$artPoster, "poster", "replyer"))

Build the social network graph

# Inner-join the comments and articles on artUrl
tsai_all <- merge(x = tsai, y = tsai_r, by = "artUrl")

# Keep the cmtPoster (commenter), artPoster (poster), and artUrl (article URL) columns
link <- tsai_all %>%
      dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: for a directed graph, graph_from_data_frame treats the
# first column as "from", the second as "to", and any later columns as edge attributes

# Build the network
net <- graph_from_data_frame(d=link, directed=T)

As before, because so many users comment online, only posters with at least 5 articles, articles with at least 500 comments, and nodes with community degree greater than 5 are included in the plot.

Narrow the scope

# Filter by comment count and by posting frequency
# (same thresholds as for 賴清德: at least 500 comments, at least 5 posts)
table(tsai$commentNum>=500)
## 
## FALSE  TRUE 
##  4546   250
tsai_poster=table(tsai$artPoster) %>% sort %>% as.data.frame 
colnames(tsai_poster)=c("artPoster","freq")
tsai_poster=tsai_poster %>% filter(freq>=5) 

link <- tsai_all %>%
  filter(commentNum >= 500) %>%                      # articles with at least 500 comments
  filter(artPoster %in% tsai_poster$artPoster) %>%   # posters with at least 5 articles
  filter(cmtStatus != "→") %>%                       # keep only PTT push (推) / boo (噓) comments
      select(cmtPoster, artPoster, artUrl, cmtStatus) 
# Keep only users that actually appear in link
# (otherwise igraph would also draw users with no edges, which is meaningless)
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% 
          arrange(desc(type))

This screens out the 4,546 articles with fewer than 500 comments and keeps the 250 heavily discussed ones.

Draw the network graph

 # Build the network
 set.seed(487)
  net <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
  # Print a label only for vertices with degree greater than 10
  labels <- degree(net)
  V(net)$label <- names(labels)
  
  V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
  # Color each edge by the comment type: push (推) vs. boo (噓)
  #E(net)$color <- ifelse(E(net)$cmtStatus == "推", "lightgreen", "palevioletred")
#   
#   plot(net, vertex.size=2, edge.arrow.size=.2,
#        vertex.label=ifelse(degree(net) > 10, V(net)$label, NA),  vertex.label.cex=.5)
#  # # Add legends
#   legend(x=-1.5, y=1, c("poster","replyer"), pch=21,
#          col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1,cex=1,text.width=1,x.intersp=-2,adj=1,y.intersp=1,bty="n")
# 
# legend(x=-2, y=0, c("push (推)","boo (噓)"), 
#         col=c("lightgreen","palevioletred"), lty=1, cex=1,
#         text.width=3,x.intersp=0,adj=2,y.intersp=1,bty="n")

(Figure) 蔡英文's ('英流') social network graph (articles with at least 500 comments)
(Figure) 蔡英文's social network graph, with push (推) / boo (噓) edges

We pick out the top three opinion leaders: Wojnarowski, cheinshin, and TWOOOOOOOOOO. The shape of 蔡英文's community resembles 賴清德's, and some accounts appear in both graphs (e.g., Wojnarowski, shared, TWOOOOOOOOOO). Presumably users who follow the DPP post updates about both candidates; the same accounts simply play different roles in the two communities.

Traits of the opinion leaders

e.g., post content, sentiment, and posting frequency

Frequency

# Leader 1: Wojnarowski (96 posts over these five months)
tsai_leader1 = tsai %>% filter(artPoster=="Wojnarowski")
summary(tsai_leader1)
##    artTitle           artDate            artTime         
##  Length:96          Length:96          Length:96         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:96          Length:96          Length:96          Min.   :  24.0  
##  Class :character   Class :character   Class :character   1st Qu.: 225.2  
##  Mode  :character   Mode  :character   Mode  :character   Median : 346.0  
##                                                           Mean   : 415.7  
##                                                           3rd Qu.: 489.5  
##                                                           Max.   :1478.0  
##       push             boo           sentence        
##  Min.   :   8.0   Min.   :  1.00   Length:96         
##  1st Qu.: 152.2   1st Qu.: 29.75   Class :character  
##  Median : 239.0   Median : 50.50   Mode  :character  
##  Mean   : 267.7   Mean   : 62.57                     
##  3rd Qu.: 343.2   3rd Qu.: 72.25                     
##  Max.   :1017.0   Max.   :446.00
tsai_leader1$artDate = as.Date(tsai_leader1$artDate)
tsai_leader1= tsai_leader1 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader1time = tsai_leader1 %>%group_by(months) %>%
  summarise(num=n()) %>% as.data.frame %>%  
  mutate( poster ="Wojnarowski" ) 


# Leader 2: cheinshin (44 posts over these five months)
tsai_leader2 = tsai %>% filter(artPoster=="cheinshin")
summary(tsai_leader2)
##    artTitle           artDate            artTime         
##  Length:44          Length:44          Length:44         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum   
##  Length:44          Length:44          Length:44          Min.   : 13.0  
##  Class :character   Class :character   Class :character   1st Qu.: 71.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :274.0  
##                                                           Mean   :317.4  
##                                                           3rd Qu.:472.8  
##                                                           Max.   :927.0  
##       push             boo           sentence        
##  Min.   :  3.00   Min.   :  2.00   Length:44         
##  1st Qu.: 32.75   1st Qu.: 16.25   Class :character  
##  Median :198.50   Median : 33.00   Mode  :character  
##  Mean   :202.61   Mean   : 52.34                     
##  3rd Qu.:295.50   3rd Qu.: 68.75                     
##  Max.   :657.00   Max.   :284.00
tsai_leader2$artDate = as.Date(tsai_leader2$artDate)
tsai_leader2= tsai_leader2 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader2time = tsai_leader2 %>%group_by(months) %>%
  summarise( num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="cheinshin" ) 

# Leader 3: TWOOOOOOOOOO (27 posts over these five months)
tsai_leader3 = tsai %>% filter(artPoster=="TWOOOOOOOOOO")
summary(tsai_leader3)
##    artTitle           artDate            artTime         
##  Length:27          Length:27          Length:27         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     artUrl           artPoster            artCat            commentNum    
##  Length:27          Length:27          Length:27          Min.   :  18.0  
##  Class :character   Class :character   Class :character   1st Qu.:  38.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :  81.0  
##                                                           Mean   : 194.8  
##                                                           3rd Qu.: 177.5  
##                                                           Max.   :1099.0  
##       push            boo           sentence        
##  Min.   :  3.0   Min.   :  1.00   Length:27         
##  1st Qu.: 14.5   1st Qu.: 10.50   Class :character  
##  Median : 21.0   Median : 24.00   Mode  :character  
##  Mean   :102.1   Mean   : 36.48                     
##  3rd Qu.: 68.5   3rd Qu.: 45.00                     
##  Max.   :745.0   Max.   :188.00
tsai_leader3$artDate = as.Date(tsai_leader3$artDate)
tsai_leader3= tsai_leader3 %>% mutate(months = as.Date(cut(artDate, "months")))
tsai_leader3time = tsai_leader3 %>%group_by(months) %>%
  summarise(num=n()) %>%
  as.data.frame%>%  
  mutate( poster ="TWOOOOOOOOOO" ) 

# Combine the three posting trends into one chart
tsai_leader = rbind(tsai_leader1time,tsai_leader2time,tsai_leader3time)

tsai_leader %>% ggplot(aes(x= months,y=num,fill=poster))  +geom_bar(stat = "identity")+
  facet_wrap(~poster, ncol = 2, scales = "free") 

The chart shows that Wojnarowski, cheinshin, and TWOOOOOOOOOO all post noticeably more from February onward, but their posting is fairly spread out, without a strong concentration in particular months. TWOOOOOOOOOO's posting frequency keeps climbing month by month, probably because online discussion heats up as the election approaches.

Content

# Leader 1 (Wojnarowski)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader1$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader1$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
 # devotion_words %>%
 #   group_by(word) %>%
 #   summarise(sum = n())%>%
 #   filter(sum>3)  %>%
 #   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

# Leader 2 (cheinshin)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader2$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader2$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
 # devotion_words %>%
 #   group_by(word) %>%
 #   summarise(sum = n())%>%
 #   filter(sum>2)  %>%
 #   arrange(desc(sum)) %>% wordcloud2(minSize = 3)


# Leader 3 (TWOOOOOOOOOO)
# Sentence segmentation: split on full- or half-width exclamation marks, question marks, semicolons, and periods
devotion_sentences <- strsplit(tsai_leader3$sentence,"[。!;?!?;]")

# Pair each sentence with the URL of the article it belongs to, as one data frame
devotion_sentences <- data.frame(
  id = rep(tsai_leader3$artUrl,sapply(devotion_sentences, length)), 
  sentence = unlist(devotion_sentences)) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the tokenizer with the user dictionary and stop words
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# Tokenize, then count how often each word appears in each article
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)

# devotion_words %>%
#   group_by(word) %>%
#   summarise(sum = n())%>%
#   filter(sum>2)  %>%
#   arrange(desc(sum)) %>% wordcloud2(minSize = 3)

(Figure) Leader 1 Wojnarowski's social network graph
(Figure) Leader 2 cheinshin's social network graph
(Figure) Leader 3 TWOOOOOOOOOO's social network graph

Wojnarowski (96 posts): a [新聞] sharer. Mostly news reports or reposts of 蔡英文's Facebook posts, covering a wide range of content: her foreign policy (opposition to "one country, two systems", exchanges with diplomatic allies), publicity for policy achievements, inspection trips such as visits to the armed forces, criticism of 韓國瑜's policies, and even lighter Facebook reposts such as eating 虎兒油飯 in Chiayi.

cheinshin (44 posts): a [臉書]/[新聞] sharer. Mainly Facebook posts and news, most of it from 蔡英文's own Facebook page. Besides the usual Taiwan-US policy and election news, the account also shares less newsworthy items from the president's schedule, such as local inspection visits, or posts reminding people not to trust fake news on LINE; it reads almost like a mouthpiece for 蔡英文's Facebook page.

TWOOOOOOOOOO (27 posts): a [新聞] sharer. The posts are uniformly from other news media and focus on other parties' criticism of 蔡英文's policies, such as foreign policy, the 1992 Consensus, and domestic governance, with commentary on both sides. Presumably the controversial nature of these reposts is what draws the larger response.

Next we dig into the sentiment of these posters' word choices.

Sentiment

# Load the stop-word dictionary (as a character vector)
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]

# Define the negation words

negation_words = c("不是","不","未","未必","毫不","決不","沒有","沒","還沒有","還沒","還不","從來沒有","從來沒","從來不","從不","非","不會","不要","不行","無法")

# Remove the negation words from the stop words
stop_words <- stop_words[!(stop_words %in% negation_words)]

# Load the user segmentation dictionary (as a character vector)
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]

# Load the LIWC sentiment dictionaries: character vectors for membership tests,
# plus a combined data frame (LIWC_ch) for the inner_join below
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
LIWC_ch <- rbind(
  data.frame(word = positive, sentiment = "positive"),
  data.frame(word = negative, sentiment = "negative")
)


# Do not apply the stop-word dictionary here (removing stop words would break the bigrams)
jieba_tokenizer = worker()

# Re-tokenize with the user dictionary, adding the negation words to the vocabulary
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# Bigram tokenizer function for unnest_tokens
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(unlist(tokens), 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

# Keep only the three opinion leaders' articles
tsai_3 = tsai %>% filter(artPoster=="Wojnarowski"|artPoster=="cheinshin"|artPoster=="TWOOOOOOOOOO")
devotion_bigram <- tsai_3 %>%
  unnest_tokens(bigram,sentence, token = jieba_bigram)

# Split each bigram into word1 and word2, dropping bigrams that contain letters or digits
bigrams_separated <- devotion_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep the bigrams whose word2 is a sentiment word
devotion_sentiment_bigrams <- bigrams_separated %>%
  filter(!word1 %in% stop_words) %>%
  filter(!word2 %in% stop_words) %>%
  inner_join(LIWC_ch, by = c(word2 = "word"))   

# For each word2 found in the sentiment dictionary:
# a positive word gets the tag "positive" and sentiment value  1
# a negative word gets the tag "negative" and sentiment value -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams %>%
  select(artUrl,artDate,artPoster, word1, word2) %>%
  mutate(sentiment=ifelse(word2 %in% positive,1,-1), sentiment_tag=ifelse(word2 %in% positive, "positive", "negative"))

# Generate every combination of date and sentiment tag over the time span
all_dates <- 
  expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$artDate)), as.Date(max(devotion_sentiment_bigrams1$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

# Flip the sentiment wherever a negation word precedes the sentiment word

devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
  mutate(sentiment=ifelse(word1 %in% negation_words, (-1)*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))

# Compute the daily sentiment counts in our data set
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
  group_by(artUrl,artDate,artPoster,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# Join the grid of all date/sentiment combinations with the daily sentiment counts
# Dates with no article or sentiment value come out as NA
# Replace those NAs with 0
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c("artDate", "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# Finally, draw the chart
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)

negated_sentiment_plot_data1 =negated_sentiment_plot_data %>%filter(!is.na(artPoster))
negated_sentiment_plot_data1 %>%
  ggplot(aes(artDate,count1,fill=sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ artPoster, scales = "fixed") +
  scale_x_date(labels = date_format("%m-%d"))

The three opinion leaders:

Wojnarowski posts the most, so its sentiment is the most densely distributed. An ardent 蔡英文 supporter, the account mostly shares her responses to outside criticism, defenses or explanations of specific policies, and statements of foreign-policy direction. The controversial content pushes positive and negative sentiment up together, while upbeat policy publicity lifts the positive side, for an overall positive-over-negative trend.

cheinshin, acting almost as a mouthpiece for 蔡英文's Facebook page, mostly posts positive publicity about her record or the president's itinerary, so positive sentiment dominates. TWOOOOOOOOOO posts less often, so its sentiment is sparser, but because most of its content is controversial, its sentiment distribution is comparatively polarized.
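
To back these qualitative reads with numbers, one could tabulate each leader's share of positive versus negative bigrams (a sketch, assuming the devotion_sentiment_bigrams_negated object from the previous chunk):

# Share of positive vs. negative (negation-adjusted) bigrams per opinion leader
devotion_sentiment_bigrams_negated %>%
  count(artPoster, sentiment_tag) %>%
  group_by(artPoster) %>%
  mutate(share = n / sum(n)) %>%
  arrange(artPoster, desc(share))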