貳、比較不同媒體的政治傾向

時間篩選 2018-06-01-2018-11-31

篩選12家臉書新聞媒體做貼文分析,其中依據政黨傾向分類如下 惟其中政黨傾向有程度之別,將在後續分析

【媒體政治傾向分類】

相對中立

  • 蘋果日報/蘋果日報即時新聞

偏民進黨

  • 新頭殼newtalk
  • 民視新聞/政經看民視 民視看正晶
  • 三立新聞/三立新聞網
  • 風傳媒(中)
  • 民報

偏國民黨

  • China Times/中視新聞/中天新聞52家族/中時即影音
  • 聯合新聞網/聯合報
  • 自由時報
  • TVBS 少康戰情室/TVBS 新聞
  • ETtoday筋斗雲/ETtoday新聞雲/東森新聞
  • Yahoo!奇摩新聞

【相似集團專頁合併】 而同一媒體集團,常有多個頻道分流 在此將各媒體不同粉絲專頁合併,並賦予資料標籤

讀入套件

require(readr)
require(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Warning: package 'NLP' was built under R version 3.5.2
require(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
require(dplyr)
require(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
require(ggplot2)
require(tidyr)
require(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.3
require(Matrix)
require(slam)
## Warning: package 'slam' was built under R version 3.5.2
require(Rtsne)
## Warning: package 'Rtsne' was built under R version 3.5.3
require(randomcoloR)
## Warning: package 'randomcoloR' was built under R version 3.5.3
require(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
require(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.5.3
library(stringi)
library(stringr)
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.3
## Warning: package 'jiebaRD' was built under R version 3.5.3
library(widyr)
## Warning: package 'widyr' was built under R version 3.5.3
library(scales)

匯入資料 此處採用政治所向Qsearch購買之2018年臉書發文資料 因6月到11月間,為選舉敏感時期,故挑選其作為分析區間

csv06 <- fread("data/201806_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv07 <- fread("data/201807_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv08 <- fread("data/201808_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv09 <- fread("data/201809_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv10 <- fread("data/201810_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv11 <- fread("data/201811_data.csv", encoding = "UTF-8")
## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.
csv_in_use=rbind(csv06,csv07,csv08,csv09,csv10,csv11)

#轉換資料格式
csv_in_use$Date = csv_in_use$Date %>% as.Date("%Y/%m/%d")
str(csv_in_use) #總覽
## Classes 'data.table' and 'data.frame':   229495 obs. of  18 variables:
##  $ Date              : Date, format: "2018-06-01" "2018-06-01" ...
##  $ Page_Name         : chr  "柯文哲" "東森新聞" "即新聞" "侯友宜" ...
##  $ Page_ID           : 'integer64' num  6.76e-310 6.16e-310 2.60e-309 1.29e-309 1.75e-309 ...
##  $ Link              : logi  NA NA NA NA NA NA ...
##  $ Type              : chr  "photo" "link" "video" "photo" ...
##  $ All_Reaction_Count: int  365168 52932 9572 9063 8584 6963 7098 6145 5843 5222 ...
##  $ LIKE_COUNT        : int  349049 51380 9155 8912 8038 6902 6201 5768 5333 4923 ...
##  $ WOW_COUNT         : int  1008 563 39 1 31 5 82 24 6 82 ...
##  $ LOVE_COUNT        : int  13223 784 264 130 17 42 32 126 415 42 ...
##  $ HAHA_COUNT        : int  1732 145 83 20 50 11 756 203 83 171 ...
##  $ SAD_COUNT         : int  99 21 6 0 128 1 10 1 1 3 ...
##  $ ANGRY_COUNT       : int  57 39 25 0 320 2 17 23 5 1 ...
##  $ Comment_Count     : int  13879 1481 294 366 232 290 334 149 2174 118 ...
##  $ Share_Count       : int  25297 1010 875 399 122 171 2317 198 706 418 ...
##  $ Message           : chr  "從今天起,臺北市政府正式脫離千億債務俱樂部。\n\n我剛上任時,接下了1468億元的債務,過了3年半還了差不多520億元,"| __truncated__ "3年還清520億! 柯文哲霸氣宣告:北市脫離千億債務…15萬人淚讚\n#魚干編:柯P就是狂~棒棒<U+0001F44D><U+0001F44D>\n\"| __truncated__ "柯P 3年還520億!!!(#安東)\n【相關報導】\n3年還520億!柯文哲創地方首長最高還債紀錄 13萬人淚讚\nhttps://www.ettod"| __truncated__ "今天南下高雄,鄉親告訴我「很久沒這麼熱了」,說的是天氣,也是人氣。\n\n朋友們的熱情和鼓勵,我都收到了,心暖暖的"| __truncated__ ...
##  $ Link_Title        : chr  "" "3年還清520億! 柯文哲霸氣宣告:北市脫離千億債務…15萬人淚讚" "上任3年還520億債務萬人淚讚 柯文哲嚴格控管預算:做該做的" "Photos from 侯友宜's post" ...
##  $ Link Description  : chr  "" "台北市長柯文哲上任以來還債成績顯著,目前已還債累計達520億,今(1)日在臉書霸氣宣布,北市政府正式脫離千億債務俱"| __truncated__ "" "" ...
##  $ created_time      : 'integer64' num  7.55e-312 7.55e-312 7.55e-312 7.55e-312 7.55e-312 ...
##  - attr(*, ".internal.selfref")=<externalptr>
#新頭殼newtalk
data_new_talk <- csv_in_use %>% 
  filter(Page_Name == "新頭殼newtalk" ) %>% 
  mutate(media="新頭殼newtalk")

# China Times/中視新聞/中天新聞52家族/中時即影音
data_china <- csv_in_use %>% 
  filter(Page_Name == "China Times" |Page_Name == "中視新聞" |Page_Name == "中天新聞52家族" |Page_Name == "中時即影音" ) %>% 
  mutate(media="China Times/中視新聞/中天新聞52家族/中時即影音")

# 聯合新聞網/聯合報
data_union <- csv_in_use %>% 
  filter(Page_Name == "聯合新聞網" |Page_Name == "聯合報"  ) %>% 
  mutate(media="聯合新聞網/聯合報")

# ETtoday筋斗雲/ETtoday新聞雲/東森新聞
data_et <- csv_in_use %>% 
  filter(Page_Name == "ETtoday筋斗雲" |  Page_Name == "ETtoday新聞雲" | Page_Name == "東森新聞") %>% 
  mutate(media="ETtoday筋斗雲/ETtoday新聞雲/東森新聞")

# 三立新聞/三立新聞網
data_trio <- csv_in_use %>% 
  filter(Page_Name == "三立新聞" | Page_Name == "三立新聞網")%>% 
  mutate(media="三立新聞/三立新聞網") 

# 風傳媒
data_wind <- csv_in_use %>% 
  filter(Page_Name == "風傳媒" ) %>% 
  mutate(media="風傳媒")

# 民報
data_people <- csv_in_use %>% 
  filter(Page_Name == "民報" ) %>% 
  mutate(media="民報")

# 自由時報
data_free <- csv_in_use %>% 
  filter(Page_Name == "自由時報" ) %>% 
  mutate(media="自由時報")

# Yahoo!奇摩新聞
data_yahoo <- csv_in_use %>% 
  filter(Page_Name == "Yahoo!奇摩新聞" ) %>% 
  mutate(media="Yahoo!奇摩新聞")

# TVBS 少康戰情室/TVBS 新聞
data_tvbs <- csv_in_use %>% 
  filter(Page_Name == "TVBS 少康戰情室" | Page_Name == "TVBS 新聞")%>% 
  mutate(media="TVBS 少康戰情室/TVBS 新聞")

# 民視新聞/政經看民視 民視看正晶
data_minsi <- csv_in_use %>% 
  filter( Page_Name == "政經看民視 民視看正晶" | Page_Name == "民視新聞") %>% 
  mutate(media="民視新聞/政經看民視 民視看正晶")

# 蘋果日報/蘋果日報即時新聞
data_apple <- csv_in_use %>% 
  filter(Page_Name == "蘋果日報 台灣" |  Page_Name == "蘋果日報即時新聞") %>% 
  mutate(media="蘋果日報/蘋果日報即時新聞")

設定結巴斷詞引擎,並匯入停用字字典 此處user.txt自訂,加入政治人物與政策相關字做斷詞 如韓國瑜、蔡英文、柯p等人物名

jieba_tokenizer <- worker(user="dict/user.txt", stop_word =  "dict/stopwords-u8.txt")

# 設定斷詞function
policy_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

將標籤化後的資料合併,統一斷詞,並過濾數字、英文及“_"

data_media <-rbind(data_new_talk,data_china,data_union,data_et,data_trio,data_wind,data_people,data_free,data_yahoo,data_tvbs,data_apple,data_minsi)

media_token <- data_media %>% unnest_tokens(word, Message, token=policy_tokenizer) %>%
               select(Page_Name,word,Date,media)  %>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))

情緒分析

接著進行各家媒體的情緒分析 使用liwc正負情緒字典

library(stringr)
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
require(tidyr)
require(readr)

p <- read_file("liwc/positive.txt")
n <- read_file("liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 5)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive

計算詞頻,如果詞彙只有一個字(如:請、年..) 或出現次數小於10,則不列入計算 最後先選出詞頻最高的20個關鍵字,由大到小排序

tokens <-media_token

word_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(word_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 韓國瑜 10171
##  2 柯文哲  8908
##  3 市長    7416
##  4 高雄    6395
##  5 下載    5306
##  6 選舉    4853
##  7 選戰    4439
##  8 陳其邁  4398
##  9 姚文智  3879
## 10 民進黨  3751
## 11 侯友宜  3487
## 12 台北    3144
## 13 丁守中  2909
## 14 蘇貞昌  2774
## 15 國民黨  2556
## 16 合一    2341
## 17 直播    2115
## 18 政治    2099
## 19 候選人  1853
## 20 台中    1843

接著依據民進黨、國民黨、相對中立 把斷詞後的數據標上blue、green、mid等政黨傾向 然後再次合併處理

blue_tokens <- tokens %>%
  filter(media== "China Times/中視新聞/中天新聞52家族/中時即影音" | media=="聯合新聞網/聯合報" | media == "自由時報" | media=="TVBS 少康戰情室/TVBS 新聞" | media=="ETtoday筋斗雲/ETtoday新聞雲/東森新聞" | media=="Yahoo!奇摩新聞")%>%
  mutate(policy="blue")

green_tokens<- tokens %>%
  filter(media== "新頭殼newtalk" | media=="民視新聞/政經看民視 民視看正晶" | media == "三立新聞/三立新聞網" | media=="風傳媒" | media=="民報" )%>%
  mutate(policy="green")

mid_tokens<- tokens %>%
  filter(media== "蘋果日報/蘋果日報即時新聞")%>%
  mutate(policy="mid")

tokens <-rbind(mid_tokens,blue_tokens,green_tokens)

word_count <- tokens %>%
  select(Page_Name,Date,word,media,policy) 

接著將斷好詞,也分類好的媒體資料 和情緒字典做inner_join 留下Page_Name, Date, sentiment,media,policy 此處情緒為正向減去負向情緒之總和

下圖為全年依政黨傾向分類之媒體情緒圖 藍色為中立,紅色為親藍媒體,綠色為親綠媒體

word_count <- tokens %>%
  inner_join(LIWC_ch) %>%
  count(Page_Name, Date, sentiment,media,policy) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>% 
  mutate(color = ifelse(sentiment < 0, "negative","positive")) 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
ggplot(word_count, aes(Date, sentiment, fill = policy)) +
  geom_col(show.legend = FALSE) 

因綜合圖表不好比較,故將各家媒體挑出 先以粉絲專頁為類別檢視,固定xy軸單位方便比較 可觀查到同一媒體集團內,不同管道仍有不同報載新聞模式

如雖同為蘋果日報, 即時新聞的情緒字眼相較本頁就少許多

而因為所抓取資料及為政治相關報導 各家媒體情緒也隨11月選舉到來而逐漸升高

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

plot_table 

接著檢視合併同一集團媒體後,所出現的情緒分布 可發現相對其他媒體中天新聞播送總情緒,較其他媒體高出許多 而聯合報系情緒圖就相對平緩許多

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~media, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~media, ncol = 4, scales = "fixed")

plot_table 

接著將各家媒體依據藍、綠、中立合併,觀察其情緒分布 可發現情緒皆因選具而逐漸升高 但因所選政治傾向媒體家數不同,下階段各選取一家作為分析

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table 

選出不同政治傾向中,情緒起伏最大的媒體 +民報 +China Times +蘋果日報即時新聞

select_word_count <- word_count %>%
  filter(Page_Name == "民報" | Page_Name == "蘋果日報 台灣" | Page_Name == "China Times")

ggplot(select_word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 1, scales = "fixed")

Contribution to sentiment

整體情緒正負面貢獻詞彙分析

word_num <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

word_num %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

將不同政治傾向媒體分開檢視

  word_num_blue <- blue_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="blue")

  word_num_green <- green_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="green")


  word_num_mid <- mid_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="mid")

word_num_blue %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "blue Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_green %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "green Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_mid %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "mid Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

tf_idf 用全部的發言下去做好像有點不太公平 用各篇文章分開來算

word_num_all <-rbind(word_num_blue,word_num_green,word_num_mid)

total_words <- word_num_all %>% 
  group_by(policy) %>% 
  summarize(total = sum(sum))

word_num_all <- left_join(word_num_all, total_words) %>%
  bind_tf_idf(word, policy, sum)%>%
  select(-total) %>%
  arrange(desc(tf_idf))
## Joining, by = "policy"
word_num_all
## # A tibble: 5,771 x 6
##    word     sum policy      tf   idf  tf_idf
##    <chr>  <int> <chr>    <dbl> <dbl>   <dbl>
##  1 精采     241 mid    0.00992  1.10 0.0109 
##  2 知識      81 mid    0.00333  1.10 0.00366
##  3 脫節      80 mid    0.00329  1.10 0.00362
##  4 拒當      77 mid    0.00317  1.10 0.00348
##  5 方念華   454 blue   0.00300  1.10 0.00330
##  6 新東     448 blue   0.00296  1.10 0.00326
##  7 募集     383 blue   0.00253  1.10 0.00278
##  8 談大     378 blue   0.00250  1.10 0.00275
##  9 編蝠     364 blue   0.00241  1.10 0.00265
## 10 超夯     271 blue   0.00179  1.10 0.00197
## # ... with 5,761 more rows

因為小編名、新聞廣告字,在各家媒體出現頻率頗高 加入停用字去除,檢視排名較高的tf_idf

mystopwords <- tibble(word = c("少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水"))

word_num_all <- anti_join(word_num_all, mystopwords, by = "word")

word_num_all %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(policy) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = policy)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~policy, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

+選戰策略與群眾脫節 /小英執政與民意有脫節

+拒當台南川普/拒當女柯p

+體檢 交通/青年政策

+火星 劉世芳批大陸勢力滲透 韓國瑜酸:下次就換火星人來了