比較台灣不同媒體的政治傾向

時間篩選：2018-06-01-2018-11-31

篩選12家常見臉書新聞媒體做貼文分析，其中依據政黨傾向分類如下，惟其中政黨傾向有程度之別，將在後續分析。

【媒體政治傾向分類】

相對中立

蘋果日報/蘋果日報即時新聞

偏民進黨

新頭殼newtalk
民視新聞/政經看民視民視看正晶
三立新聞/三立新聞網
風傳媒(中)
民報

偏國民黨

China Times/中視新聞/中天新聞52家族/中時即影音
聯合新聞網/聯合報
自由時報
TVBS 少康戰情室/TVBS 新聞
ETtoday筋斗雲/ETtoday新聞雲/東森新聞
Yahoo!奇摩新聞

packages = c("readr","tm", "data.table", "dplyr", "stringr", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(servr)

## Loading required package: servr

#設定中文編碼，並匯入套件
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")

## [1] ""

packages = c("dplyr","ggplot2", "data.table", "scales", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

require(scales)

## Loading required package: scales

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(ggplot2)

## Loading required package: ggplot2

require(data.table)

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require(scales)

library(dplyr)
library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## corrplot 0.84 loaded

library(wordcloud2)
library(wordcloud)

## Loading required package: RColorBrewer

library(ggpubr)

## Loading required package: magrittr

library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)

library(stringr)
require(reshape2)

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

require(tidyr)

## Loading required package: tidyr

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

## The following object is masked from 'package:magrittr':
## 
##     extract

require(readr)

## Loading required package: readr

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

讀取各月份資料

csv06 <- fread("C:/Users/user/Desktop/R/2019/201806_data.csv", encoding = "UTF-8")
csv07 <- fread("C:/Users/user/Desktop/R/2019/201807_data.csv", encoding = "UTF-8")
csv08 <- fread("C:/Users/user/Desktop/R/2019/201808_data.csv", encoding = "UTF-8")
csv09 <- fread("C:/Users/user/Desktop/R/2019/201809_data.csv", encoding = "UTF-8")
csv10 <- fread("C:/Users/user/Desktop/R/2019/201810_data.csv", encoding = "UTF-8")
csv11 <- fread("C:/Users/user/Desktop/R/2019/201811_data.csv", encoding = "UTF-8")

csv_in_use=rbind(csv06,csv07,csv08,csv09,csv10,csv11)

#轉換資料格式
csv_in_use$Date = csv_in_use$Date %>% as.Date("%Y/%m/%d")
str(csv_in_use) #總覽

## Classes 'data.table' and 'data.frame':   229495 obs. of  18 variables:
##  $ Date              : Date, format: "2018-06-01" "2018-06-01" ...
##  $ Page_Name         : chr  "柯文哲" "東森新聞" "即新聞" "侯友宜" ...
##  $ Page_ID           :integer64 136845026417486 124616330906800 525781104253994 261813197541354 354487984641189 339483189800311 240170506141170 102884532662 ... 
##  $ Link              : logi  NA NA NA NA NA NA ...
##  $ Type              : chr  "photo" "link" "video" "photo" ...
##  $ All_Reaction_Count: int  365168 52932 9572 9063 8584 6963 7098 6145 5843 5222 ...
##  $ LIKE_COUNT        : int  349049 51380 9155 8912 8038 6902 6201 5768 5333 4923 ...
##  $ WOW_COUNT         : int  1008 563 39 1 31 5 82 24 6 82 ...
##  $ LOVE_COUNT        : int  13223 784 264 130 17 42 32 126 415 42 ...
##  $ HAHA_COUNT        : int  1732 145 83 20 50 11 756 203 83 171 ...
##  $ SAD_COUNT         : int  99 21 6 0 128 1 10 1 1 3 ...
##  $ ANGRY_COUNT       : int  57 39 25 0 320 2 17 23 5 1 ...
##  $ Comment_Count     : int  13879 1481 294 366 232 290 334 149 2174 118 ...
##  $ Share_Count       : int  25297 1010 875 399 122 171 2317 198 706 418 ...
##  $ Message           : chr  "從今天起，臺北市政府正式脫離千億債務俱樂部。\n\n我剛上任時，接下了1468億元的債務，過了3年半還了差不多520億元，"| __truncated__ "3年還清520億！ 柯文哲霸氣宣告：北市脫離千億債務…15萬人淚讚\n#魚干編：柯P就是狂～棒棒<U+0001F44D><U+0001F44D>\n\"| __truncated__ "柯P 3年還520億!!!(#安東)\n【相關報導】\n3年還520億！柯文哲創地方首長最高還債紀錄　13萬人淚讚\nhttps://www.ettod"| __truncated__ "今天南下高雄，鄉親告訴我「很久沒這麼熱了」，說的是天氣，也是人氣。\n\n朋友們的熱情和鼓勵，我都收到了，心暖暖的"| __truncated__ ...
##  $ Link_Title        : chr  "" "3年還清520億！ 柯文哲霸氣宣告：北市脫離千億債務…15萬人淚讚" "上任3年還520億債務萬人淚讚　柯文哲嚴格控管預算：做該做的" "Photos from 侯友宜's post" ...
##  $ Link Description  : chr  "" "台北市長柯文哲上任以來還債成績顯著，目前已還債累計達520億，今（1）日在臉書霸氣宣布，北市政府正式脫離千億債務俱"| __truncated__ "" "" ...
##  $ created_time      :integer64 1527821093000 1527843600000 1527852216000 1527854173000 1527841143000 1527825601000 1527864886000 1527851700000 ... 
##  - attr(*, ".internal.selfref")=<externalptr>

【相似集團專頁合併】而同一媒體集團，常有多個頻道分流在此將各媒體不同粉絲專頁合併，並賦予資料標籤

設定結巴斷詞引擎，並匯入停用字字典此處user.txt自訂，加入政治人物與政策相關字做斷詞如韓國瑜、蔡英文、柯p等人物名。

將標籤化後的資料合併，統一斷詞，並過濾數字、英文及“_"

data_media <-rbind(data_new_talk,data_china,data_union,data_et,data_trio,data_wind,data_people,data_free,data_yahoo,data_tvbs,data_apple,data_minsi)

media_token <- data_media %>% unnest_tokens(word, Message, token=policy_tokenizer) %>%
               select(Page_Name,word,Date,media)  %>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))

各媒體情緒分布

接著進行各家媒體的情緒分析，使用liwc正負情緒字典

library(stringr)
require(reshape2)
require(tidyr)
require(readr)

p <- read_file("C:/Users/user/Desktop/social_media/mid_project_media/dict/liwc/positive.txt")
n <- read_file("C:/Users/user/Desktop/social_media/mid_project_media/dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 5)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive

計算詞頻，如果詞彙只有一個字(如:請、年..)，或出現次數小於10，則不列入計算；最後先選出詞頻最高的20個關鍵字，由大到小排序。

tokens <-media_token

word_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(word_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 韓國瑜 10171
##  2 柯文哲  8908
##  3 市長    7416
##  4 高雄    6395
##  5 下載    5306
##  6 選舉    4853
##  7 選戰    4439
##  8 陳其邁  4398
##  9 新聞    4299
## 10 姚文智  3879
## 11 民進黨  3751
## 12 侯友宜  3487
## 13 台北    3144
## 14 丁守中  2909
## 15 蘇貞昌  2774
## 16 國民黨  2556
## 17 合一    2341
## 18 蘋果    2229
## 19 直播    2115
## 20 政治    2099

接著依據民進黨、國民黨、相對中立，把斷詞後的數據標上blue、green、mid等政黨傾向，然後再次合併處理。

接著將斷好詞，也分類好的媒體資料，和情緒字典做inner_join，留下Page_Name,Date,sentiment,media,policy，此處情緒為正向減去負向情緒之總和。

全年依政黨傾向分類之媒體情緒圖

下圖為全年依政黨傾向分類之媒體情緒圖（藍色為中立，紅色為親藍媒體，綠色為親綠媒體）

word_count <- tokens %>%
  inner_join(LIWC_ch) %>%
  count(Page_Name, Date, sentiment,media,policy) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>% 
  mutate(color = ifelse(sentiment < 0, "negative","positive"))

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

ggplot(word_count, aes(Date, sentiment, fill = policy)) +
  geom_col(show.legend = FALSE)

各家媒體情緒圖

因綜合圖表不好比較，故將各家媒體挑出；先以粉絲專頁為類別檢視，固定xy軸單位方便比較；可觀察到同一媒體集團內，不同管道仍有不同報載新聞模式（如雖同為蘋果日報，即時新聞的情緒字眼相較本頁就少許多）

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

plot_table

接著檢視合併同一集團媒體後，所出現的情緒分布可發現相對其他媒體中天新聞播送總情緒，較其他媒體高出許多而聯合報系情緒圖就相對平緩許多

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~media, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~media, ncol = 4, scales = "fixed")

plot_table

在各派系媒體分類中，欲求證是否在政治事件的表述上，有親綠媒體或親藍媒體較為情緒化表述的現象，比較後發現，各立場皆有相對情緒化與相對偏向理性陳述的媒體，加總比較後不相上下。親藍為china >times、yahoo奇摩新聞、中視新聞情緒起伏較大；親綠則為民報、風傳媒、新頭殼、三立新聞等情緒變化較為明顯， >對於相對政治中立的蘋果日報來說，也因報導內容偏好聳動標題與情緒性用詞，情緒曲線也呈現劇烈變動。其中東森新聞、聯合報在新聞表現上，則較少使用情緒性用語，而呈現相對平緩的情緒曲線。總括來看，我們可以發現這些媒體的情緒皆因選具而逐漸升高。但因所選政治傾向媒體家數不同，下階段各選取一家作為分析

接著將各家媒體依據藍、綠、中立合併，觀察其情緒分布。

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table

個別媒體情緒圖

選出不同政治傾向中，情緒起伏最大的媒體 +民報 +China Times +蘋果日報即時新聞

select_word_count <- word_count %>%
  filter(Page_Name == "民報" | Page_Name == "蘋果日報 台灣" | Page_Name == "China Times")

ggplot(select_word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 1, scales = "fixed")

正負面情緒貢獻詞彙Contribution to sentiment

word_num <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

word_num %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

將不同政治傾向媒體分開檢視

  word_num_blue <- blue_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="blue")

  word_num_green <- green_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="green")


  word_num_mid <- mid_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="mid")

word_num_blue %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "blue Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_green %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "green Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_mid %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "mid Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

在整體情緒正負面貢獻詞中發現，親藍與親綠媒體的正負貢獻詞差異不大。親藍媒體正向詞多為支持、希望、熱情，負向詞多為尷尬、激烈、緊張、無效；而親綠媒體正向詞多為支持、分享、希望，負向詞多為批評、違法、落伍等；相對中立的蘋果正向詞多為不錯、支持、希望，負向詞多為無效、譴責、激烈、抗議。總括來看，三者結果皆為政治新聞常見詞彙，彼此間沒有太明顯的差異。　

tf_idf

用全部的發言下去做好像有點不太公平，所以用各篇文章分開來算

word_num_all <-rbind(word_num_blue,word_num_green,word_num_mid)

total_words <- word_num_all %>% 
  group_by(policy) %>% 
  summarize(total = sum(sum))

word_num_all <- left_join(word_num_all, total_words) %>%
  bind_tf_idf(word, policy, sum)%>%
  select(-total) %>%
  arrange(desc(tf_idf))

## Joining, by = "policy"

word_num_all

## # A tibble: 5,819 x 6
##    word     sum policy      tf   idf  tf_idf
##    <chr>  <int> <chr>    <dbl> <dbl>   <dbl>
##  1 蘋果    2194 mid    0.0747  0.405 0.0303 
##  2 爭霸戰   601 mid    0.0205  1.10  0.0225 
##  3 精采     241 mid    0.00821 1.10  0.00902
##  4 好聞     240 mid    0.00817 1.10  0.00898
##  5 風雲    1039 blue   0.00656 1.10  0.00720
##  6 天下事   162 mid    0.00552 1.10  0.00606
##  7 觀點     918 green  0.00543 1.10  0.00596
##  8 新聞網   678 green  0.00401 1.10  0.00440
##  9 資訊     305 mid    0.0104  0.405 0.00421
## 10 速報     609 green  0.00360 1.10  0.00395
## # ... with 5,809 more rows

因為小編名、新聞廣告字，在各家媒體出現頻率頗高，加入停用字去除，檢視排名較高的tf_idf

mystopwords <- tibble(word = c("少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水","方念華","募集","玩樂","中視","老爹","剖析","丹利","官方","大小事","全民","全世界","本日","一手","參與","意見","魔形","孩童","指出","人物","公眾","貓女","專欄","影像","精彩","南勢角","盛竹","一事","快來","投書","萬磁王","士林","爭霸戰","精彩","海報","影創","郭子","關鍵時刻","給你","來點","前哨站","阿尼","就在","條紋","追蹤","史蒂芬","我要","日前","午間","大方","正鯿","鎖定","送票","周周","鎖定","小丑","蘆竹"))

word_num_all <- anti_join(word_num_all, mystopwords, by = "word")

word_num_all %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(policy) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = policy)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~policy, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

各家媒體詞彙的tf idf分析，在刪去小編與報紙名後，可以發現（１）親藍媒體出現韓總、北農、藍天等，關注由韓國瑜帶起的韓流現象，與九合一大選綠地變藍天的態勢。

（２）而親綠媒體中，「北農」詞彙和韓國瑜在政治宣傳上，大力強調的個人政績有關，「市政、國家、執政」詞彙，推估因民進黨為執政黨，故和國家執策的相關報導較有關係，

（３）在相對中立的蘋果日報排名中，因本身偏向聳動標題、具高度話題性的新聞，詞彙相對激烈，「脫節」評論藍營或綠營的施政理念與民眾脫節，「侵台、移民」和兩岸關係相關，「鐵粉」和藍綠兩黨各自的強力擁護者的相關報導，並常用「動未條」形容政治人物的誇張新聞，如柯文哲和學姊「陪吃說」和上綜藝節目時發生的搞笑事件。而「火星」則是韓國瑜面對綠營質疑，中國勢力是否滲入高雄選舉的事件中，出現「要是韓國瑜民調6成、陳其邁4成就會說月球人來了，搞不好下次7比3就是火星人來了」發言所致。

臉書不同媒體的政治傾向比較_以2018年6-11月為例

劉岱宜