Analysis of Media Performance Ahead of the Nine-in-One Election

  • Part 0: Data cleaning and filtering
  • Part 1: Topic analysis: leanings across the twelve media outlets
  • Part 2: Word usage and sentiment across media outlets
  • Part 3: Comparing word usage across media of different political leanings in articles about candidate 韓國瑜
  • Conclusion

Motivation and goals:

After the nine-in-one elections, many began to ask whether online and media attention had become a major force, giving this campaign unusually high visibility and discussion. We analyze 2018 Facebook post data that the Graduate Institute of Political Science purchased from Qsearch, select the twelve media outlets with the largest volume of posts, and examine whether their coverage during the campaign had a decisive influence on the result. We hope the analysis can serve as a starting point for thinking about online attention in the 2020 presidential election, suggest how parties might gain an early edge in the battle for online attention, and offer readers some advice on media literacy.

Part 0: Data cleaning and filtering

  • 新頭殼newtalk
  • China Times/中視新聞/中天新聞52家族/中時即影音
  • 聯合新聞網/聯合報
  • ETtoday筋斗雲/ETtoday新聞雲/東森新聞
  • 三立新聞/三立新聞網
  • 風傳媒
  • 民報
  • 自由時報
  • Yahoo!奇摩新聞
  • TVBS 少康戰情室/TVBS 新聞
  • 民視新聞
  • 蘋果日報/蘋果日報即時新聞
  • Time filter: 2018-06-01 to 2018-11-23 (this window covers the sensitive run-up to the election, so we use it as the analysis period)
# 載入csv資料
data06 = fread("201806_data.csv", encoding = "UTF-8")
data07 = fread("201807_data.csv", encoding = "UTF-8")
data08 = fread("201808_data.csv", encoding = "UTF-8")
data09 = fread("201809_data.csv", encoding = "UTF-8")
data10 = fread("201810_data.csv", encoding = "UTF-8")
data11 = fread("201811_data.csv", encoding = "UTF-8")

# 整理資料
data06<-unite(data06, id, Page_Name, Date, sep= "-",remove = F)
data07<-unite(data07, id, Page_Name, Date, sep= "-",remove = F)
data08<-unite(data08, id, Page_Name, Date, sep= "-",remove = F)
data09<-unite(data09, id, Page_Name, Date, sep= "-",remove = F)
data10<-unite(data10, id, Page_Name, Date, sep= "-",remove = F)
data11<-unite(data11, id, Page_Name, Date, sep= "-",remove = F)
df = rbind(data06,data07,data08,data09,data10,data11)
df$Page_ID=as.character(df$Page_ID)

# 篩出6/1~11/23
df$date= as.POSIXct(df$Date,format="%Y/%m/%d")
df =df %>% filter(date>="2018-06-01"& date<="2018-11-23")

# 排序並看哪些媒體資料最多
n= table(df$Page_Name) %>% sort  %>% as.data.frame

# Keep only the pages belonging to the twelve media outlets
df = df %>% 
  filter(Page_Name %in% c(
    "新頭殼newtalk", "China Times", "中視新聞", "中天新聞52家族", "中時即影音",
    "聯合新聞網", "聯合報", "ETtoday筋斗雲", "ETtoday新聞雲", "東森新聞",
    "三立新聞", "三立新聞網", "風傳媒", "民報", "自由時報", "Yahoo!奇摩新聞",
    "TVBS 少康戰情室", "TVBS 新聞", "民視新聞", "政經看民視 民視看正晶",
    "蘋果日報 台灣", "蘋果日報即時新聞"))

# 存成rda檔
save(df,file="data.Rda")

Install the required packages

packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot","htmlwidgets","servr","gridExtra","corrplot","ggpubr","reshape2","quanteda","Matrix","slam","Rtsne","randomcoloR","wordcloud","wordcloud2","stringi","widyr","scales"
)
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the packages

library(knitr)
library(dplyr)
library(kableExtra)
knitr::opts_chunk$set(echo = TRUE)
options(knitr.table.format = "html") 

#1
require(readr)
require(tm)
require(data.table)
require(stringr)
require(jiebaR)
library(jiebaRD)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(webshot)
require(htmlwidgets)
require(servr)
#2
library(gridExtra)
library(corrplot)
library(ggpubr)
require(reshape2)
#3
require(quanteda)
require(Matrix)
require(slam)
require(Rtsne)
require(randomcoloR)
require(wordcloud)
require(wordcloud2)
library(stringi)
library(widyr)
library(scales)

Part 1: Topic analysis with LDA

load("data.rda")
df = df %>% select(id,Message)
colnames(df) = c("artUrl","Message")

Sentence segmentation

sample_sentences <- strsplit(df$Message,"[。!;?!?;]")
sample_sentences<- data.frame(
                        artUrl = rep(df$artUrl, sapply(sample_sentences, length)), 
                        sentence = unlist(sample_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
sample_sentences$sentence <- as.character(sample_sentences$sentence)
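As a quick illustration of the splitting rule (a made-up post, not taken from the dataset), the character class covers both full-width and half-width sentence-ending punctuation:

# Hypothetical example: one post is split into three sentences
strsplit("韓國瑜今天造勢!現場湧入大批支持者。選情升溫?", "[。!;?!?;]")
# [[1]] "韓國瑜今天造勢"  "現場湧入大批支持者"  "選情升溫"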

Tokenization and loading the dictionaries

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker(user = "lexicon/news_lexicon.txt",write  = "NOFILE",stop_word = "dict/stop_words.txt")

# new_user_word(jieba_tokenizer, c(lexicon))
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
     tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
tokens <- sample_sentences %>%
  unnest_tokens(word, sentence,token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)

Add an article-identifying id and convert the data into a Document-Term Matrix (DTM)

news <- tokens %>%
  mutate(artId = group_indices(., artUrl))
news_dtm <-news %>% cast_dtm(artId, word, count)
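
To get a feel for the resulting object (a quick check, not part of the original pipeline), the DTM can be printed to see its size and sparsity:

# Quick look at the Document-Term Matrix (sketch)
news_dtm        # prints the number of documents/terms and the sparsity
dim(news_dtm)   # rows = articles, columns = distinct terms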

Building the LDA model: finding each topic's representative terms

The \(\phi\) matrix (the per-topic word distribution, returned as beta by tidy())

# news_lda <- LDA(news_dtm, k = 6, control = list(seed = 1234))
# news_topics <- tidy(news_lda, matrix = "beta") 
# save(news_lda,news_topics,file = "lda.Rda")
load("lda.Rda")
remove_words <- c("選戰", "新聞","頭殼","新頭殼","選舉","生活圈","市長","下載","直播","候選人","戰情","大家","點到","選情","內容","專屬")
news_top_terms <- news_topics %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic,-beta)


news_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

We first fit an LDA model with six topics to all posts published by these outlets during the campaign period. The chart shows that most of the six topics relate to the mayoral races in the special municipalities. The exception is topic 5, which centers on the political talk show 少康戰情室; topics 3 and 4 both concern Kaohsiung, which indicates that the Kaohsiung mayoral race took up a very large share of media coverage in this election.
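
The number of topics was fixed at k = 6 here. As a sanity check one could compare models with different k, for example by perplexity; the sketch below is illustrative only and is not part of the original analysis (fitting several models on the full DTM can be slow).

# Sketch: compare perplexity for a few candidate values of k (assumes news_dtm from above)
ks <- c(2, 4, 6, 8, 10)
perp <- sapply(ks, function(k) {
  m <- LDA(news_dtm, k = k, control = list(seed = 1234))
  perplexity(m)   # perplexity on the training DTM; lower is generally better, beware overfitting at large k
})
data.frame(k = ks, perplexity = perp)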

Terms that differ most between topics

We compare topic 1 and topic 6, looking for words that appear often in one topic but rarely in the other.

beta_spread <- news_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .0004 | topic6 > .0004) %>%
  mutate(log_ratio = log2(topic6 / topic1))
news_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio), beta_spread %>% top_n(-10, log_ratio)) %>%
  arrange(log_ratio)

news_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  xlab("Word")+
  coord_flip()

We compare topic 1 and topic 6. The terms in the lower left are those that appear much more often in topic 1 than in topic 6, such as 反深, 核四 and 停建, all tied to the 深澳 power plant controversy. Topic 1 happens to be the New Taipei mayoral race: in September, 蘇貞昌 received petitions from residents living near the planned 深澳 plant, which generated the related coverage.

LDA model after filtering

Remove all words that appear in three or fewer articles.

reserved_word <- news %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) %>% 
  pull(word)

news_removed <- news %>% 
  filter( word %in% reserved_word)
news_dtm_removed <- news_removed %>% cast_dtm(artId, word, count)
# news_lda_removed <- LDA(news_dtm_removed, k = 6, control = list(seed = 1234))
# news_topics_removed <- tidy(news_lda_removed, matrix = "beta")
# save(news_lda_removed,news_topics_removed,file = "lda_removed.Rda")
load("lda1.Rda")
news_topics_removed %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

After removing words that appear in three or fewer articles and refitting the LDA model, we find three topics related to 韓國瑜, while the other topics cover the Taipei and New Taipei races. Topics 5 and 6 both feature 柯文哲, so we dig further by comparing the beta values of topics 5 and 6 with a log ratio.

Between-topic differences after filtering

beta_spread_removed <- news_topics_removed %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic6 > .0004 | topic5 > .0004) %>%
  mutate(log_ratio = log2(topic5 / topic6))

news_removed_topic_ratio <- rbind(beta_spread_removed %>% top_n(10,wt = log_ratio), beta_spread_removed %>% top_n(-10, log_ratio)) %>% 
  arrange(log_ratio)

news_removed_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  coord_flip()

Comparing topic 5 against topic 6, the terms that stand out in topic 5 include 東亞, 奧運, 主辦權 and 解約. In July, citing the signature drive in Taiwan for the "Tokyo Olympics name rectification" referendum, China pushed a vote in an extraordinary council meeting of the East Asian Olympic Committee that stripped Taichung of the right to host the following year's East Asian Youth Games, and 柯文哲 commented that Beijing does not understand Taiwanese society. This topic mainly covers that news story.

Part 2: Comparing the political leanings of different media outlets

Time filter: 2018-06-01 to 2018-11-23

load("data.rda")
#轉換資料格式
df$Date = df$Date %>% as.Date("%Y/%m/%d")
str(df) #總覽
## 'data.frame':    27007 obs. of  21 variables:
##  $ id                : chr  "風傳媒-2018/06/01 12:31:05-風傳媒" "風傳媒-2018/06/01 10:20:00-風傳媒" "風傳媒-2018/06/01 11:20:00-風傳媒" "風傳媒-2018/06/01 09:20:00-風傳媒" ...
##  $ Date              : Date, format: "2018-06-01" "2018-06-01" ...
##  $ Page_Name         : chr  "風傳媒" "風傳媒" "風傳媒" "風傳媒" ...
##  $ Page_ID           : chr  "186758878172497" "186758878172497" "186758878172497" "186758878172497" ...
##  $ Link              : logi  NA NA NA NA NA NA ...
##  $ Type              : chr  "link" "link" "link" "link" ...
##  $ All_Reaction_Count: int  1535 1102 457 443 402 324 290 270 253 261 ...
##  $ LIKE_COUNT        : int  1455 933 370 343 330 293 260 242 239 237 ...
##  $ WOW_COUNT         : int  7 10 4 5 5 0 2 2 3 1 ...
##  $ LOVE_COUNT        : int  26 4 1 1 0 3 1 0 2 6 ...
##  $ HAHA_COUNT        : int  31 124 37 72 56 24 21 19 8 16 ...
##  $ SAD_COUNT         : int  3 6 0 0 0 1 0 2 1 0 ...
##  $ ANGRY_COUNT       : int  13 25 45 22 11 3 6 5 0 1 ...
##  $ Comment_Count     : int  131 120 67 113 18 37 29 37 5 20 ...
##  $ Share_Count       : int  52 21 17 45 5 5 1 0 8 12 ...
##  $ Message           : chr  "「從今天起,臺北市政府正式脫離千億債務俱樂部。」\n\n台北市長柯文哲表示,他剛上任時,接下了1468億元的債務,現在"| __truncated__ "「因為黨已經沒有分裂的本錢。」\n\n呂秀蓮要向民進黨Bye Bye的舉動,讓總統蔡英文做出回應......\n\n<U+272A> 獨家內"| __truncated__ "給原住民「樂活基金」1500元,此舉讓民進黨議員洪健益質疑柯文哲,為何不能一樣發給一般百姓?\n\n對此,柯文哲說話了."| __truncated__ "新北市長參選人侯友宜一席「本土不是少數政黨的專利」言論,引發民進黨秘書長洪耀福不滿回應...\n\n洪耀福表示,中國國"| __truncated__ ...
##  $ Link_Title        : chr  "3年還520億!柯P霸氣宣布:北市脫離千億債務俱樂部-風傳媒" "不排除以無黨籍身分參選市長?呂秀蓮與民進黨分道揚鑣,蔡英文決定回應了-風傳媒" "「原民有票我們沒票嗎?」發放給原住民1500元挨批,柯文哲回應了…-風傳媒" "侯友宜稱「本土」不是少數政黨專利,洪耀福:先問問老闆要不要改名台灣國民黨?-風傳媒" ...
##  $ Link Description  : chr  "台北市長柯文哲今(1)日早上,在其個人臉書宣布:「從今天起,臺北市政府正式脫離千億債務俱樂部。」柯表示,市府現在"| __truncated__ "總統蔡英文今(31)日邀黨籍立委餐敘,會中談到前副總統呂秀蓮要與民進黨ByeBye,與會者轉述,總統將親自拜會呂秀蓮;"| __truncated__ "台北市議會31日繼續進行總質詢,民進黨籍市議員洪健益在質詢時,詢問市長柯文哲為何以「樂活基金」名目繼續發1500元給"| __truncated__ "年底新北市長選舉,國民黨候選人侯友宜稱「本土不是少數政黨的專利」,民進黨秘書長洪耀福今(31)日則反擊,指「我們"| __truncated__ ...
##  $ created_time      : num  7.55e-312 7.55e-312 7.55e-312 7.55e-312 7.55e-312 ...
##  $ date              : POSIXct, format: "2018-06-01" "2018-06-01" ...
##  $ Media             : chr  "風傳媒" "風傳媒" "風傳媒" "風傳媒" ...

We select twelve Facebook news media for post-level analysis and classify them by party leaning as listed below.
The leanings differ in degree, which we examine in the later analysis.

[Classification of media by political leaning]

Relatively neutral

  • 蘋果日報/蘋果日報即時新聞

Pro-DPP (green-leaning)

  • 新頭殼newtalk
  • 民視新聞/政經看民視 民視看正晶
  • 三立新聞/三立新聞網
  • 風傳媒 (closer to neutral)
  • 民報
  • 自由時報

Pro-KMT (blue-leaning)

  • China Times/中視新聞/中天新聞52家族/中時即影音
  • 聯合新聞網/聯合報
  • TVBS 少康戰情室/TVBS 新聞
  • ETtoday筋斗雲/ETtoday新聞雲/東森新聞
  • Yahoo!奇摩新聞

[Merging pages from the same media group]
A media group often spreads its content across several pages,
so we merge each group's fan pages and attach a media label to the data.

# 讀入資料
load("csv_in_use.Rda")
# #新頭殼newtalk
# data_new_talk <- df %>% 
#   filter(Page_Name == "新頭殼newtalk" ) %>% 
#   mutate(media="新頭殼newtalk")
# 
# # China Times/中視新聞/中天新聞52家族/中時即影音
# data_china <- df %>% 
#   filter(Page_Name == "China Times" |Page_Name == "中視新聞" |Page_Name == "中天新聞52家族" |Page_Name == "中時即影音" ) %>% 
#   mutate(media="China Times/中視新聞/中天新聞52家族/中時即影音")
# 
# # 聯合新聞網/聯合報
# data_union <- df %>% 
#   filter(Page_Name == "聯合新聞網" |Page_Name == "聯合報"  ) %>% 
#   mutate(media="聯合新聞網/聯合報")
# 
# # ETtoday筋斗雲/ETtoday新聞雲/東森新聞
# data_et <- df %>% 
#   filter(Page_Name == "ETtoday筋斗雲" |  Page_Name == "ETtoday新聞雲" | Page_Name == "東森新聞") %>% 
#   mutate(media="ETtoday筋斗雲/ETtoday新聞雲/東森新聞")
# 
# # 三立新聞/三立新聞網
# data_trio <- df %>% 
#   filter(Page_Name == "三立新聞" | Page_Name == "三立新聞網")%>% 
#   mutate(media="三立新聞/三立新聞網") 
# 
# # 風傳媒
# data_wind <- df %>% 
#   filter(Page_Name == "風傳媒" ) %>% 
#   mutate(media="風傳媒")
# 
# # 民報
# data_people <- df %>% 
#   filter(Page_Name == "民報" ) %>% 
#   mutate(media="民報")
# 
# # 自由時報
# data_free <- df %>% 
#   filter(Page_Name == "自由時報" ) %>% 
#   mutate(media="自由時報")
# 
# # Yahoo!奇摩新聞
# data_yahoo <-df %>% 
#   filter(Page_Name == "Yahoo!奇摩新聞" ) %>% 
#   mutate(media="Yahoo!奇摩新聞")
# 
# # TVBS 少康戰情室/TVBS 新聞
# data_tvbs <- df %>% 
#   filter(Page_Name == "TVBS 少康戰情室" | Page_Name == "TVBS 新聞")%>% 
#   mutate(media="TVBS 少康戰情室/TVBS 新聞")
# 
# # 民視新聞/政經看民視 民視看正晶
# data_minsi <- df %>% 
#   filter( Page_Name == "政經看民視 民視看正晶" | Page_Name == "民視新聞") %>% 
#   mutate(media="民視新聞/政經看民視 民視看正晶")
# 
# # 蘋果日報/蘋果日報即時新聞
# data_apple <- df %>% 
#   filter(Page_Name == "蘋果日報 台灣" |  Page_Name == "蘋果日報即時新聞") %>% 
#   mutate(media="蘋果日報/蘋果日報即時新聞")

Set up the jieba segmentation engine and load the stop-word dictionary. The user.txt dictionary is customized with politician and policy terms, such as 韓國瑜, 蔡英文 and 柯P, so that these names are segmented as single tokens.

jieba_tokenizer <- worker(user="dict/user.txt", stop_word =  "dict/stopwords-u8.txt")

# 設定斷詞function
policy_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
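
A quick illustration of why the user dictionary matters (a hypothetical input; the exact segmentation depends on the dictionary contents): without an entry for 韓國瑜, jieba may split the name into two tokens, whereas with the entry it is returned whole.

# Hypothetical check of the custom tokenizer on a made-up sentence
policy_tokenizer("韓國瑜與陳其邁辯論高雄市政")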

Combine the labelled data, tokenize it all at once, and filter out digits, English letters and the underscore character.

data_media <-rbind(data_new_talk,data_china,data_union,data_et,data_trio,data_wind,data_people,data_free,data_yahoo,data_tvbs,data_apple,data_minsi)

media_token <- data_media %>% unnest_tokens(word, Message, token=policy_tokenizer) %>%
               select(Page_Name,word,Date,media)  %>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))

Sentiment distribution by media outlet

Next we run a sentiment analysis for each outlet, using the LIWC positive and negative sentiment dictionaries.

p <- read_file("liwc/positiveliwc.txt")
n <- read_file("liwc/negativeliwc.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)

p <- read_file("liwc/positiveliwc.txt")
n <- read_file("liwc/negativeliwc.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
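
A quick sanity check (optional, not in the original write-up) that the dictionary loaded as expected:

# How many positive / negative entries did we load, and what do they look like?
table(LIWC_ch$sentiment)
head(LIWC_ch)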

Compute word frequencies. Single-character tokens (such as 請 or 年) and words appearing ten times or fewer are excluded; we then list the twenty most frequent keywords in descending order.

tokens <-media_token

word_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(word_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 韓國瑜 10171
##  2 柯文哲  8908
##  3 市長    7416
##  4 高雄    6395
##  5 下載    5306
##  6 選舉    4853
##  7 選戰    4439
##  8 陳其邁  4398
##  9 新聞    4299
## 10 姚文智  3879
## 11 民進黨  3751
## 12 侯友宜  3487
## 13 台北    3144
## 14 丁守中  2909
## 15 蘇貞昌  2774
## 16 國民黨  2556
## 17 合一    2341
## 18 蘋果    2229
## 19 直播    2115
## 20 政治    2099

Next, we tag the tokenized data with a leaning label of blue, green or mid, according to whether the outlet leans KMT, leans DPP or is relatively neutral, and then combine the three subsets again.

blue_tokens <- tokens %>%
  filter(media== "China Times/中視新聞/中天新聞52家族/中時即影音" | media=="聯合新聞網/聯合報" | media=="TVBS 少康戰情室/TVBS 新聞" | media=="ETtoday筋斗雲/ETtoday新聞雲/東森新聞" | media=="Yahoo!奇摩新聞")%>%
  mutate(policy="blue")

green_tokens<- tokens %>%
  filter(media== "新頭殼newtalk" | media=="民視新聞/政經看民視 民視看正晶" | media == "三立新聞/三立新聞網" | media=="風傳媒" | media=="民報" | media == "自由時報" )%>%
  mutate(policy="green")

mid_tokens<- tokens %>%
  filter(media== "蘋果日報/蘋果日報即時新聞")%>%
  mutate(policy="mid")

tokens <-rbind(mid_tokens,blue_tokens,green_tokens)

word_count <- tokens %>%
  select(Page_Name,Date,word,media,policy) 

We then inner_join the tokenized and classified media data with the sentiment dictionary, keeping Page_Name, Date, sentiment, media and policy; the sentiment score here is the count of positive words minus the count of negative words.

The figure below shows media sentiment over the whole period, grouped by party leaning (in the plot, blue bars are the neutral outlets, red bars the blue-leaning outlets, and green bars the green-leaning outlets).

word_count <- tokens %>%
  inner_join(LIWC_ch) %>%
  count(Page_Name, Date, sentiment,media,policy) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>% 
  mutate(color = ifelse(sentiment < 0, "negative","positive")) 
## Joining, by = "word"
ggplot(word_count, aes(Date, sentiment, fill = policy)) +
  geom_col(show.legend = FALSE) 

Because the combined chart is hard to compare, we break it out by outlet. We first facet by fan page, fixing the x and y scales so the panels are comparable. Even within the same media group, different channels show different posting patterns: for example, although both belong to 蘋果日報, the instant-news page uses noticeably fewer emotional words than the main page.

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

plot_table 

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~media, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~media, ncol = 4, scales = "fixed")

plot_table 
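
The comment in the two chunks above says that dates with no posts should count as 0, but no such step actually appears; one way to fill the gaps (a sketch, assuming word_count as built above) is tidyr::complete():

# Sketch: add a zero-sentiment row for every page/date combination with no posts
word_count_filled <- word_count %>%
  ungroup() %>%
  complete(nesting(Page_Name, media, policy),
           Date = seq(min(Date), max(Date), by = "day"),
           fill = list(positive = 0, negative = 0, sentiment = 0))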

Within each leaning we wanted to check whether green-leaning or blue-leaning media describe political events in more emotional terms. After comparing, we find that each camp contains both relatively emotional and relatively sober outlets, and in aggregate the two sides are about even. On the blue side, China Times, Yahoo!奇摩新聞 and 中視新聞 swing the most; on the green side, 民報, 風傳媒, 新頭殼 and 三立新聞 show the clearest swings. The relatively neutral 蘋果日報, with its preference for sensational headlines and emotional wording, also shows a sharply fluctuating curve, while 東森新聞 and 聯合報 use fewer emotional terms and stay comparatively flat.
Overall, sentiment rises across all of these outlets as the election approaches. Because the number of outlets differs across leanings, in the next step we pick one outlet from each leaning for analysis.

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table 

From each political leaning, pick the outlet with the largest sentiment swings:
+ 民報
+ China Times
+ 蘋果日報 台灣

select_word_count <- word_count %>%
  filter(Page_Name == "民報" | Page_Name == "蘋果日報 台灣" | Page_Name == "China Times")

ggplot(select_word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 1, scales = "fixed")

Words contributing to positive and negative sentiment (contribution to sentiment)

word_num <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

word_num %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"

Examining media of each political leaning separately

  word_num_blue <- blue_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="blue")

  word_num_green <- green_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="green")


  word_num_mid <- mid_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="mid")

word_num_blue %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "blue Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"

word_num_green %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "green Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"

word_num_mid %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "mid Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"

Looking at the words that contribute most to overall sentiment, the blue-leaning and green-leaning media differ little.
For blue-leaning media the top positive words are 支持, 希望 and 熱情, and the top negative words 尷尬, 激烈, 緊張 and 無效;
for green-leaning media the top positive words are 支持, 分享 and 希望, and the top negative words 批評, 違法 and 落伍;
for the relatively neutral 蘋果日報 the top positive words are 不錯, 支持 and 希望, and the top negative words 無效, 譴責, 激烈 and 抗議.
Overall, all three sets are common political-news vocabulary, with no obvious differences between them.

tf-idf for the three political leanings

Pooling every post into one undifferentiated corpus would blur the comparison, so we treat each leaning's word counts as its own document when computing tf-idf.

word_num_all <-rbind(word_num_blue,word_num_green,word_num_mid)

total_words <- word_num_all %>% 
  group_by(policy) %>% 
  summarize(total = sum(sum))

word_num_all <- left_join(word_num_all, total_words) %>%
  bind_tf_idf(word, policy, sum)%>%
  select(-total) %>%
  arrange(desc(tf_idf))
## Joining, by = "policy"
word_num_all
## # A tibble: 5,794 x 6
##    word     sum policy      tf   idf  tf_idf
##    <chr>  <int> <chr>    <dbl> <dbl>   <dbl>
##  1 蘋果    2194 mid    0.0747  0.405 0.0303 
##  2 爭霸戰   601 mid    0.0205  1.10  0.0225 
##  3 精采     241 mid    0.00821 1.10  0.00902
##  4 好聞     240 mid    0.00817 1.10  0.00898
##  5 風雲    1039 blue   0.00745 1.10  0.00818
##  6 天下事   162 mid    0.00552 1.10  0.00606
##  7 觀點     919 green  0.00487 1.10  0.00535
##  8 資訊     305 mid    0.0104  0.405 0.00421
##  9 糯米     528 blue   0.00378 1.10  0.00416
## 10 江湖     517 blue   0.00370 1.10  0.00407
## # ... with 5,784 more rows
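
For reference, tf-idf here is computed with each leaning treated as one document, so with \(N = 3\) documents \(\text{tf-idf}(t, d) = \frac{f_{t,d}}{\sum_{t'} f_{t',d}} \times \ln\frac{N}{n_t}\), where \(f_{t,d}\) is the count of term \(t\) in leaning \(d\) and \(n_t\) is the number of leanings containing the term. That is why terms unique to one leaning (such as 爭霸戰) get \(\ln(3/1) \approx 1.10\) and terms shared by two leanings (such as 蘋果) get \(\ln(3/2) \approx 0.405\) in the table above.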

Editors' signature phrases and promotional boilerplate appear very frequently on every page, so we add them as stop words and then look at the highest-ranking tf-idf terms.

mystopwords <- tibble(word = c("少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水","方念華","募集","玩樂","中視","老爹","剖析","丹利","官方","大小事","全民","全世界","本日","一手","參與","意見","魔形","孩童","指出","人物","公眾","貓女","專欄","影像","精彩","南勢角","盛竹","一事","快來","投書","萬磁王","士林","爭霸戰","精彩","海報","影創","郭子","關鍵時刻","給你","來點","前哨站","阿尼","就在","條紋","追蹤","史蒂芬","我要","日前","午間","大方","正鯿","鎖定","送票","周周","鎖定","小丑","蘆竹"))

word_num_all <- anti_join(word_num_all, mystopwords, by = "word")

word_num_all %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(policy) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = policy)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~policy, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

After removing editor signatures and outlet names, the per-leaning tf-idf analysis shows:
(1) Blue-leaning media feature 韓總, 北農 and 藍天, reflecting attention to the 韓流 wave set off by 韓國瑜 and to the "green turns blue" momentum of the nine-in-one elections.

(2) In green-leaning media, 北農 is linked to the personal record that 韓國瑜 emphasized heavily in his campaigning, while 市政, 國家 and 執政 likely reflect coverage of national policy, since the DPP was the governing party.

(3) In the relatively neutral 蘋果日報, which favours sensational, high-buzz stories, the vocabulary is comparatively charged: 脫節 appears in commentary that blue or green platforms are out of touch with the public, 侵台 and 移民 relate to cross-strait relations, and 鐵粉 to coverage of each party's hard-core supporters. The colloquial 動未條 is often used for politicians' outlandish moments, such as 柯文哲 and 學姊 with the "dining companion" remark and his variety-show appearances. 火星 comes from 韓國瑜's response to green-camp questions about Chinese interference in the Kaohsiung race: "if my polls were at 60% and 陳其邁's at 40% they would say the moon people have arrived; maybe next time at 70 to 30 it will be the Martians."

Part 3: Comparing word usage across media of different political leanings in articles about candidate 韓國瑜

Important terms in pre-election media coverage

# 讀入資料
load("data.rda")

# 先做斷句(以全形或半形驚歎號、問號、分號以及句號爲依據進行斷句)
devotion_sentences <- strsplit(df$Message,"[。!;?!?;]")

# 將每句句子,與所屬的文章連結配對起來,整理成一個dataframe
# devotion_sentences <- data.frame(
#   id = rep(df$id,sapply(devotion_sentences, length)), 
#   sentence = unlist(devotion_sentences)) %>%
#   filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# 使用斷詞引擎,放入要用的詞典和停用字
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# 進行斷詞,並計算各詞彙在各文章中出現的次數
# devotion_words <- devotion_sentences %>%
#   unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#   filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#   count(id, word, sort = TRUE)
# save(devotion_words,devotion_sentences,file = "devotion.Rda")
load("devotion.Rda")
devotion_words
## # A tibble: 373,494 x 3
##    id                                                word       n
##    <fct>                                             <chr>  <int>
##  1 民報-2018/07/13 19:01:00-民報                     工會      15
##  2 民報-2018/07/13 19:01:00-民報                     總工會    15
##  3 TVBS 少康戰情室-2018/06/01 20:00:03-TVBS          呂秀蓮    12
##  4 Yahoo!奇摩新聞-2018/11/02 13:59:57-Yahoo!奇摩新聞 台南      12
##  5 民報-2018/11/18 23:11:25-民報                     透明      12
##  6 中視新聞-2018/06/05 17:45:28-中時                 香蕉      11
##  7 中視新聞-2018/06/05 22:00:07-中時                 香蕉      11
##  8 民報-2018/10/14 00:50:35-民報                     台灣      11
##  9 民報-2018/11/18 23:11:25-民報                     革命      11
## 10 民報-2018/11/21 18:29:57-民報                     高雄      11
## # ... with 373,484 more rows
devotion_words <- separate(devotion_words, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)

# 接著為了要計算tfidf,我們先去計算每篇文章包含的詞數
total_words <- devotion_words %>% 
  group_by(id) %>% 
  summarize(total = sum(n))

# 合併 devotion_words(每個詞彙在每個文章中出現的次數)與total_words(每篇文章的詞數)
devotion_words <- left_join(devotion_words, total_words)
## Joining, by = "id"
devotion_words
## # A tibble: 373,494 x 7
##    id                     Page_Name   Date       Media    word      n total
##    <fct>                  <chr>       <chr>      <chr>    <chr> <int> <int>
##  1 民報-2018/07/13 19:01:0~ 民報        2018/07/1~ 民報     工會     15   159
##  2 民報-2018/07/13 19:01:0~ 民報        2018/07/1~ 民報     總工會~    15   159
##  3 TVBS 少康戰情室-2018/06/01~ TVBS 少康戰情室~ 2018/06/0~ TVBS     呂秀蓮~    12   525
##  4 Yahoo!奇摩新聞-2018/11/02~ Yahoo!奇摩新聞~ 2018/11/0~ Yahoo!奇~ 台南     12    86
##  5 民報-2018/11/18 23:11:2~ 民報        2018/11/1~ 民報     透明     12   153
##  6 中視新聞-2018/06/05 17:45~ 中視新聞    2018/06/0~ 中時     香蕉     11   177
##  7 中視新聞-2018/06/05 22:00~ 中視新聞    2018/06/0~ 中時     香蕉     11   177
##  8 民報-2018/10/14 00:50:3~ 民報        2018/10/1~ 民報     台灣     11    74
##  9 民報-2018/11/18 23:11:2~ 民報        2018/11/1~ 民報     革命     11   153
## 10 民報-2018/11/21 18:29:5~ 民報        2018/11/2~ 民報     高雄     11   126
## # ... with 373,484 more rows
# 以每篇文章爲單位,計算每個詞彙在的tf-idf值
devotion_words_tf_idf <- devotion_words %>%
  bind_tf_idf(word, id, n)
devotion_words_tf_idf
## # A tibble: 373,494 x 10
##    id         Page_Name  Date   Media word      n total     tf   idf tf_idf
##    <fct>      <chr>      <chr>  <chr> <chr> <int> <int>  <dbl> <dbl>  <dbl>
##  1 民報-2018/0~ 民報       2018/~ 民報  工會     15   159 0.0943  7.63  0.720
##  2 民報-2018/0~ 民報       2018/~ 民報  總工會~    15   159 0.0943  7.36  0.695
##  3 TVBS 少康戰情~ TVBS 少康戰情~ 2018/~ TVBS  呂秀蓮~    12   525 0.0229  5.74  0.131
##  4 Yahoo!奇摩新~ Yahoo!奇摩新~ 2018/~ Yaho~ 台南     12    86 0.140   4.41  0.615
##  5 民報-2018/1~ 民報       2018/~ 民報  透明     12   153 0.0784  6.64  0.521
##  6 中視新聞-2018~ 中視新聞   2018/~ 中時  香蕉     11   177 0.0621  7.11  0.442
##  7 中視新聞-2018~ 中視新聞   2018/~ 中時  香蕉     11   177 0.0621  7.11  0.442
##  8 民報-2018/1~ 民報       2018/~ 民報  台灣     11    74 0.149   3.17  0.471
##  9 民報-2018/1~ 民報       2018/~ 民報  革命     11   153 0.0719  7.71  0.554
## 10 民報-2018/1~ 民報       2018/~ 民報  高雄     11   126 0.0873  2.38  0.208
## # ... with 373,484 more rows
# Having computed tf-idf for every word in every article, we now take each article's ten highest tf-idf words
# and count how often each word is selected, to see which words were especially important to the media over this period.
devotion_words_tf_idf %>% 
  group_by(id) %>%
  top_n(10) %>%
  arrange(desc(id)) %>%
  ungroup() %>%
  count(word, sort=TRUE) %>%
  filter(n>500) %>%
  ggplot(aes(x=word,y=n,fill=""))+
  geom_bar(stat="identity")+
  labs(x = "word",y = "sum")+
  ggtitle("所有媒體的重要辭彙")+ 
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -60)) 
## Selecting by tf_idf

# # 分媒體來看
# devotion_words_tf_idf %>% 
#   group_by(id) %>%
#   top_n(10) %>%
#   arrange(desc(id)) %>%
#   ungroup() %>%
#   count(Media,word, sort=TRUE) %>%
#   filter(n>150) %>%
#   ggplot(aes(word,n,fill=Media))+
#   geom_bar(stat="identity")+
#   labs(x = "word",y = "sum")+
#   ggtitle("各家媒體的重要辭彙(n>150)")+ 
#   theme(legend.position='none')  +
#   facet_wrap(~Media,ncol = 2, scales = "free") +
#   coord_flip()
(Figure: important terms for each media outlet)

Overall: across these 27,007 posts, the three most-discussed figures are 韓國瑜, 柯文哲 and 陳其邁. 柯文哲 built his career on online attention, so the striking finding is 韓國瑜's online prominence this time, which also lifted discussion of 陳其邁 and of Kaohsiung; the Kaohsiung race was clearly being run on online media topics and drew extremely high discussion online.
By outlet: the top terms are still mostly candidate names, but some differences stand out. TVBS stressed its 11/24 cross-platform election-night livestream hosted by 方念華 and its 選戰風雲 hashtag; 新頭殼 mostly covered the Kaohsiung and New Taipei races; 三立 and 自由時報 mentioned 韓國瑜 relatively little, focusing instead on 柯文哲; and 風傳媒 visibly relied on reader submissions (投稿) for material.

Finding the words that appear around 韓國瑜 in articles

# 這邊使用 jiebar and ngrams 來尋找特定詞彙的前後5個詞彙
# ngram functiong設定, where n=11 (這邊一樣有停用詞跟辭典)
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")
ngram_11 <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(tokens, 11)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
# 執行ngram_11進行分詞
devotion_ngram_11 <- df %>%
  select(id,Message) %>%
  unnest_tokens(ngram, Message, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))

# 將ngram拆成word1 ~ word11
ngrams_11_separated <- devotion_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")

# 尋找 "韓國瑜" 和 "國瑜" 出現的前後五個詞彙
yu_five_words <- ngrams_11_separated %>%
  filter((word6=="韓國瑜"|word6=="國瑜"))

# 尋找 "韓國瑜" 和 "國瑜" 出現的前後五個詞彙常出現哪些的詞彙
yu_five_words_count <- yu_five_words %>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)

# 先來看這幾大媒體中有韓國瑜的提及比例
yu_five_words1 <- separate(yu_five_words, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)
yu_num =table(yu_five_words1$Page_Name) %>%sort %>% as.data.frame 
total_num =table(df$Page_Name) %>%sort %>% as.data.frame 
yu_total_num= inner_join(yu_num, total_num, by = c(Var1 = "Var1"))
yu_total_num = yu_total_num %>% mutate(prop = Freq.x / Freq.y) %>% arrange(desc(prop))
yu_total_num %>%
  ggplot(aes(x=Var1,y=prop,fill=""))+
  geom_bar(stat="identity")+
  labs(x = "Page_Name",y = "number of article")+
  ggtitle("各大媒體報韓國瑜比例")+ 
  theme(legend.position='none')+
  coord_flip()

# 視覺化
yu_five_words_count %>%
  arrange(desc(abs(n))) %>%
  head(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words near by 韓國瑜 (整體來看)") +
  ylab("Word count") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light"))

# 若分兩派來看
yu_five_words<- separate(yu_five_words, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)
yu_five_words_count1 <- yu_five_words %>%
  filter(Media=="中時"|Media=="聯合"|Media=="TVBS"|Media=="ETtoday東森"|Media=="Yahoo!奇摩新聞") %>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)

yu_five_words_count2 <- yu_five_words %>%
  filter(Media=="新頭殼newtalk"|Media=="民視新聞"|Media=="三立"|Media=="自由時報"|Media=="民報"|Media=="風傳媒") %>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
yu_five_words_count = bind_rows(yu_five_words_count1 %>% 
                                  mutate(name="偏藍"),
                                yu_five_words_count2%>% 
                                  mutate(name="偏綠"))
# 視覺化
yu_five_words_count%>%
  arrange(desc(abs(n))) %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = name)) +
  geom_col(show.legend = FALSE) +
  xlab("Words near by 韓國瑜 (媒體分派) ") +
  ylab("Word count") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light")) +
  facet_wrap(~name, ncol = 2, scales = "free")

# 更細分每個媒體的話
# 挑掉太普遍的詞還有不重要的詞: e.g.魚子醬,阿泥是中時小編(專發韓新聞的)
stop = c("高雄","高雄市","陳其邁","候選人","市長","國民黨","民進黨","小編","韓國瑜","阿泥","魚子醬","大安","金鋼","少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水","方念華","募集","玩樂","中視","老爹","剖析","丹利","官方","大小事","全民","全世界","本日","一手","參與","意見","魔形","孩童","指出","人物","公眾","貓女","專欄","影像","精彩","南勢角","盛竹","一事","快來","投書","萬磁王","士林","爭霸戰","精彩","海報","影創","郭子","關鍵時刻","給你","來點","前哨站","阿尼","就在","條紋","追蹤","史蒂芬","我要","日前","午間","大方","正鯿","鎖定","送票","周周","鎖定","小丑","蘆竹")

yu_five_words_count1 <- yu_five_words1 %>%
  group_by(Media)%>%
  melt(id.vars = "Media", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop), nchar(word)>1) %>%
  count(Media,word, sort = TRUE)

# yu_five_words_count1 %>%
#   arrange(desc(abs(n))) %>%
#   group_by(Media) %>% 
#   top_n(5) %>%
#   mutate(word = reorder(word, n)) %>%
#   ungroup()  %>%
#   ggplot(aes(word, n ,fill =Media)) +
#   geom_col(show.legend = FALSE) +
#   xlab("Words near by \"韓國瑜\" & \"國瑜\"") +
#   ylab("Word count") +
#   coord_flip()+ 
#   facet_wrap(~Media, ncol = 2, scales = "free") +
#   theme(text = element_text(family = "Heiti TC Light"))
(Figure: words near 韓國瑜, by media outlet)

(1) Overall:
The surrounding words are mostly campaign-related: 高雄, 陳其邁, 市長 and similar terms dominate the context.
(2) Splitting the media into blue-leaning and green-leaning:
Green-leaning media tend to mention 韓國瑜 in plain, descriptive terms, whereas blue-leaning media cover his rallies (造勢) and highlight polls (民調) and livestreams (直播), his stronger points, or report claims that he was being smeared (in his livestreams he often said he was being smeared black, red or yellow), giving their coverage a more promotional feel than the green media.
(3) Looking at each outlet separately:
Each outlet's attitude toward 韓國瑜 shows through. 東森 and 中時 both push his rallies (where he performed well); 三立 and 民視 emphasize the debates (where 陳其邁 performed better); TVBS talks about him being smeared; 自由時報 uses 韓粉 mockingly; 民報 deliberately brings up 中國 when discussing his positions; 蘋果 prefers the gossipy angle, such as 邱議瑩's "sleeping companion" remark; and 新頭殼 stresses numbers such as online buzz and polling.

Finding the words that appear around 國民黨 and 民進黨 in articles

# 尋找 "國民黨"或"民進黨"出現的前後五個詞彙
five_words <- ngrams_11_separated %>%
  filter((word6=="國民黨"|word6=="民進黨"))
five_words<- separate(five_words, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)

# 尋找 "國民黨" 出現的前後五個詞彙常出現哪些的詞彙
five_words_count1 <- five_words %>%
  filter(Media=="中時"|Media=="聯合"|Media=="自由時報"|Media=="TVBS"|Media=="ETtoday東森"|Media=="Yahoo!奇摩新聞") %>%
  filter(word6=="國民黨")%>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
five_words_count2 <- five_words %>%
  filter(Media=="新頭殼newtalk"|Media=="民視新聞"|Media=="三立"|Media=="民報"|Media=="風傳媒") %>%
  filter(word6=="國民黨")%>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
five_words_count=
  bind_rows(five_words_count1 %>% 
            mutate(name="偏藍"),
          five_words_count2%>% 
            mutate(name="偏綠"))

# 尋找 "民進黨" 出現的前後五個詞彙常出現哪些的詞彙
five_words_count3 <- five_words %>%
  filter(Media=="中時"|Media=="聯合"|Media=="自由時報"|Media=="TVBS"|Media=="ETtoday東森"|Media=="Yahoo!奇摩新聞") %>%
  filter(word6=="民進黨")%>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
five_words_count4 <- five_words %>%
  filter(Media=="新頭殼newtalk"|Media=="民視新聞"|Media=="三立"|Media=="民報"|Media=="風傳媒") %>%
  filter(word6=="民進黨")%>%
  melt(id.vars = "id", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
five_words_count5=
  bind_rows(five_words_count3 %>% 
              mutate(name="偏藍"),
            five_words_count4%>% 
              mutate(name="偏綠"))

# # 視覺化
# five_words_count %>%
#   arrange(desc(abs(n))) %>%
#   head(20) %>%
#   mutate(word = reorder(word, n)) %>%
#   ggplot(aes(word, n, fill = name)) +
#   geom_col(show.legend = FALSE) +
#   xlab("Words near by \"國民黨\" ") +
#   ylab("Word count") +
#   coord_flip()+ 
#   theme(text = element_text(family = "Heiti TC Light")) +
#   facet_wrap(~name, ncol = 2, scales = "free")
# 
# five_words_count5 %>%
#   arrange(desc(abs(n))) %>%
#   head(20) %>%
#   mutate(word = reorder(word, n)) %>%
#   ggplot(aes(word, n, fill = name)) +
#   geom_col(show.legend = FALSE) +
#   xlab("Words near by \"民進黨\" ") +
#   ylab("Word count") +
#   coord_flip()+ 
#   theme(text = element_text(family = "Heiti TC Light")) +
#   facet_wrap(~name, ncol = 2, scales = "free")

(Figures: words within five tokens of 民進黨 and of 國民黨)

Whether the subject is the KMT or the DPP, blue-leaning media concentrate on 韓國瑜, while green-leaning media mention the party's various candidates. This suggests that, in terms of online attention, the blue camp was very much "leading with" 韓國瑜 at this point, at the expense of its other candidates.

Sentiment analysis of each outlet's coverage of 韓國瑜

# Load the stop-word dictionary (kept as a plain character vector)
stop_words <- read_file("dict/stop_word.txt")
stop_words <- strsplit(stop_words, "[\r]")[[1]]

# Load the negation-word dictionary
negation_words <- read_file("dict/negation_words.txt")
negation_words <- strsplit(negation_words, "[\r]")[[1]]

# Drop the negation words from the stop-word list (they are needed for the bigram flip below)
stop_words <- stop_words[!(stop_words %in% negation_words)]

# Load the user dictionary for segmentation
use_dict <- read_file("dict/use_dict.txt")
use_dict <- strsplit(use_dict, "[\r]")[[1]]

# 載入liwc情緒字典
# p <- read_file("liwc/positiveliwc.txt")
# n <- read_file("liwc/negativeliwc.txt")
# positive <- strsplit(p, "[,]")[[1]]
# negative <- strsplit(n, "[,]")[[1]]
# positive <- data.frame(word = positive, sentiments = "positive")
# negative <- data.frame(word = negative, sentiemtns = "negative")
# colnames(negative) = c("word","sentiment")
# colnames(positive) = c("word","sentiment")
# LIWC_ch <- rbind(positive, negative)
# 
# p <- read_file("liwc/positiveliwc.txt")
# n <- read_file("liwc/negativeliwc.txt")
# positive <- strsplit(p, "[,]")[[1]]
# negative <- strsplit(n, "[,]")[[1]]
# 

# 這裏不加入stop word字典(清掉的話會影響bigram結果)
jieba_tokenizer = worker()

# Re-segment using the custom user dictionary, adding the negation words to the tokenizer as well
new_user_word(jieba_tokenizer, c(use_dict,negation_words))
## [1] TRUE
# unnest_tokens 使用的bigram分詞函數,並執行bigram分詞
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(unlist(tokens), 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
devotion_bigram <- df %>%
  unnest_tokens(bigram,Message, token = jieba_bigram)

# 將bigram拆成word1和word2,並將包含英文字母或和數字的詞彙清除
bigrams_separated <- devotion_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# 並選出word2爲情緒詞的bigram
devotion_sentiment_bigrams <- bigrams_separated %>%
  filter(!word1 %in% stop_words) %>%
  filter(!word2 %in% stop_words) %>%
  inner_join(LIWC_ch, by = c(word2 = "word"))   

# 畫出有韓國瑜文章的情緒走勢圖
# 從剛剛做的"yu_five_words"中篩出哪幾篇是有出現韓國瑜文章
# 篩掉同一篇文章
yu_five_words1=yu_five_words[!duplicated(yu_five_words$id), ] 
# inner_join: keep only the bigrams coming from posts that mention 韓國瑜
devotion_sentiment_bigrams1 = inner_join(devotion_sentiment_bigrams, yu_five_words1, by = c(id = "id"))
devotion_sentiment_bigrams1=separate(devotion_sentiment_bigrams1, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)
devotion_sentiment_bigrams1$date= as.Date(devotion_sentiment_bigrams1$Date,format="%Y/%m/%d") 

# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲  1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
devotion_sentiment_bigrams1 <- devotion_sentiment_bigrams1 %>%
  select(id,date, word1.x, word2.x) %>%
  mutate(sentiment=ifelse(word2.x %in% positive,1,-1), sentiment_tag=ifelse(word2.x %in% positive, "positive", "negative"))

# 生成一個時間段中的 日期和情緒標籤的所有可能組合

all_dates <- 
  expand.grid(seq(as.Date(min(devotion_sentiment_bigrams1$date)), as.Date(max(devotion_sentiment_bigrams1$date)), by="day"), c("positive", "negative"))
names(all_dates) <- c("date", "sentiment")

# 計算我們資料集中 每日的情緒值
sentiment_plot_data <- devotion_sentiment_bigrams1 %>%
  group_by(id,date,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
sentiment_plot_data <- all_dates %>% 
  merge(sentiment_plot_data,by.x=c('date', "sentiment"),by.y=c('date', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0)) 
  
# 時間區段中,每日的情緒值
sentiment_plot_data=sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)

sentiment_plot_data = separate(sentiment_plot_data, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)

# 畫圖
all_plot <- sentiment_plot_data %>%
  ggplot(aes(date,count1,fill=sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Media, scales = "fixed") +
  scale_x_date(labels = date_format("%m-%d")) 
# all_plot

# 反轉前面是否定詞且後面爲情緒詞彙的組合
devotion_sentiment_bigrams %>%
  filter(word1 %in% negation_words) %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 102 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 不    喜歡     35
##  2 不    開心     35
##  3 不    可以     34
##  4 不    願意     31
##  5 不    相信     28
##  6 不    希望     24
##  7 不    支持     23
##  8 不    清楚     23
##  9 不    容易     22
## 10 不    尊重     19
## # ... with 92 more rows
devotion_sentiment_bigrams_negated <- devotion_sentiment_bigrams1 %>%
  mutate(sentiment=ifelse(word1.x %in% negation_words, (-1)*sentiment, sentiment)) %>%
  mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
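
# Tiny worked example of the flipping rule (hypothetical rows; assumes "支持" is in the
# positive LIWC list and "不" is in negation_words, as the bigram counts above suggest):
# "非常 支持" keeps +1, while "不 支持" is flipped to -1.
example <- data.frame(word1.x = c("非常", "不"), word2.x = c("支持", "支持"),
                      stringsAsFactors = FALSE)
example %>%
  mutate(sentiment = ifelse(word2.x %in% positive, 1, -1)) %>%
  mutate(sentiment = ifelse(word1.x %in% negation_words, -sentiment, sentiment))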

# 計算我們資料集中每日的情緒值
negated_sentiment_plot_data <- devotion_sentiment_bigrams_negated %>%
  group_by(id,date,sentiment_tag,sentiment) %>%
  summarise(count=n())  

# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
negated_sentiment_plot_data <- all_dates %>% 
  merge(negated_sentiment_plot_data,by.x=c('date', "sentiment"),by.y=c('date', "sentiment_tag"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

# 最後把圖畫出來
negated_sentiment_plot_data=negated_sentiment_plot_data %>%
  mutate(sentiment.y = replace_na(sentiment.y, 0)) %>%
  mutate(count1 = sentiment.y * count)
negated_sentiment_plot_data= separate(negated_sentiment_plot_data, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)

# negated_sentiment_plot_data %>%
#   ggplot(aes(date,count1,fill=sentiment)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~ Media, scales = "fixed") +
#   scale_x_date(labels = date_format("%m-%d")) 

(Figures: sentiment of each outlet's 韓國瑜 articles, before and after flipping negated sentiment words)

Because articles can be sarcastic or ironic, flipping negated sentiment words helps less here, and some posts are still scored incorrectly (e.g. 民報: "從小跟班變救世主,國民黨走向滅亡?"; 中時: "韓國瑜被抹黑").
Still, two outlets (中時 and 民報) use noticeably more emotional wording in their 韓國瑜 articles than the others, so readers should treat them with extra care. Compared with the earlier overall sentiment analysis, the outlets other than 中時 and 民報 actually use fewer emotional words here than in their ordinary coverage, which may make them more suitable pre-election reading.

Pick the two outlets at opposite extremes (中時 and 民報) and compute word frequencies for their 韓國瑜-related articles

# 有出現韓國瑜的文章直接算詞頻
jieba_tokenizer = worker(user="dict/use_dict.txt", stop_word = "dict/stop_word.txt",write = "NOFILE")

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })}

# 選出有出現韓國瑜的文章,並整理資料
devotion_sentences2 = inner_join(devotion_sentences, yu_five_words1, by = c(id = "id"))
devotion_sentences2 <- devotion_sentences2 %>% select(-(word1:word11))

devotion_sentences2 = separate(devotion_sentences2, id ,c("Page_Name","Date","Media"), sep = "-",remove = FALSE)

# # 斷詞後篩出中時的資料並畫出文字雲
# tokens<- devotion_sentences2 %>%
#   unnest_tokens(word, sentence, token=chi_tokenizer) %>% 
#   filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#   filter(Media=="中時") %>%
#   filter(nchar(.$word)>1) %>%
#   group_by(word) %>% 
#   summarise(sum = n()) %>% 
#   filter(sum>10) %>%
#   arrange(desc(sum))
# tokens %>% wordcloud2(minSize = 3)
# 
# # 斷詞後篩出民報的資料並畫出文字雲
# tokens1<- devotion_sentences2 %>%
#   unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#   filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
#   filter(Media=="民報") %>%
#   filter(nchar(.$word)>1) %>%
#   group_by(word) %>%
#   summarise(sum = n()) %>%
#   filter(sum>10) %>%
#   arrange(desc(sum))
# tokens1 %>% wordcloud2(minSize = 3)

(Word clouds: 韓國瑜-related articles in 中時 vs. 民報)

The two word clouds largely share the same terms: 高雄, 陳其邁, 國民黨, 民進黨 and so on. The differences are:
1. 中時 emphasizes 造勢 (rallies) and 民調 (polls); 民報 emphasizes 政策 and 政見 (policies and platforms).
2. 中時 uses 兩岸 (cross-strait); 民報 uses 台灣 and 中國.

Conclusion

  • Summary of findings:
    1. Pre-election issues:
      Each constituency had its own issues: the Taipei race featured topics around the 柯P organ-transplant controversy, while the New Taipei race brought up the 深澳 power plant and 核四.
    2. The protagonists of online attention:
      The campaign clearly spilled over onto the internet, and the protagonists were 韓國瑜, 柯文哲 and 陳其邁.
    3. Each outlet's pre-election attitude:
      Even within the same media group, the amount of emotional coverage varies with each channel's audience; 中國時報, 民報, 風傳媒, 蘋果日報 and 新頭殼 stand out, and the effect grows markedly as the election month approaches. Green-leaning, blue-leaning and relatively neutral media all report political events in emotional terms.
    4. Who the media lead with:
      Blue-leaning media put their online attention squarely behind 韓國瑜, whereas green-leaning media mention each of their candidates.
    5. Each outlet's attitude toward 韓國瑜:
      Outlets describe him with vocabulary that matches their own stance (e.g. 造勢, 韓粉, 中國). The outlets with the most clear-cut stances on 韓國瑜 (民報 and 中時) are the most emotional in their 韓國瑜 articles, and their coverage of him reads the most combative.
  • Directions for further thought:
    1. Although it is hard to establish how discussion translates into votes, a media strategy of leading with one candidate does appear to lift overall discussion; in the 2020 presidential race, avoiding splitting attention within a party and concentrating it on one figure might have a similar effect.
    2. All outlets use fairly emotional wording, and it intensifies just before the election, so readers who rely on the media to decide their vote should try not to be swayed by articles published in the final month before the vote, when emotions run especially high.