貳、比較不同媒體的政治傾向

時間篩選 2018-06-01-2018-11-31

篩選12家臉書新聞媒體做貼文分析，其中依據政黨傾向分類如下惟其中政黨傾向有程度之別，將在後續分析

【媒體政治傾向分類】

相對中立

蘋果日報/蘋果日報即時新聞

偏民進黨

新頭殼newtalk
民視新聞/政經看民視民視看正晶
三立新聞/三立新聞網
風傳媒(中)
民報

偏國民黨

China Times/中視新聞/中天新聞52家族/中時即影音
聯合新聞網/聯合報
自由時報
TVBS 少康戰情室/TVBS 新聞
ETtoday筋斗雲/ETtoday新聞雲/東森新聞
Yahoo!奇摩新聞

【相似集團專頁合併】而同一媒體集團，常有多個頻道分流在此將各媒體不同粉絲專頁合併，並賦予資料標籤

讀入套件

require(readr)
require(tm)

## Warning: package 'tm' was built under R version 3.5.3

## Warning: package 'NLP' was built under R version 3.5.2

require(data.table)

## Warning: package 'data.table' was built under R version 3.5.3

require(dplyr)
require(tidytext)

## Warning: package 'tidytext' was built under R version 3.5.3

require(ggplot2)
require(tidyr)
require(quanteda)

## Warning: package 'quanteda' was built under R version 3.5.3

require(Matrix)
require(slam)

## Warning: package 'slam' was built under R version 3.5.2

require(Rtsne)

## Warning: package 'Rtsne' was built under R version 3.5.3

require(randomcoloR)

## Warning: package 'randomcoloR' was built under R version 3.5.3

require(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.5.3

require(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 3.5.3

library(stringi)
library(stringr)
library(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.5.3

## Warning: package 'jiebaRD' was built under R version 3.5.3

library(widyr)

## Warning: package 'widyr' was built under R version 3.5.3

library(scales)

匯入資料此處採用政治所向Qsearch購買之2018年臉書發文資料因6月到11月間，為選舉敏感時期，故挑選其作為分析區間

csv06 <- fread("data/201806_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv07 <- fread("data/201807_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv08 <- fread("data/201808_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv09 <- fread("data/201809_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv10 <- fread("data/201810_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv11 <- fread("data/201811_data.csv", encoding = "UTF-8")

## Warning in require_bit64(): Some columns are type 'integer64' but
## package bit64 is not installed. Those columns will print as strange
## looking floating point data. There is no need to reload the data. Simply
## install.packages('bit64') to obtain the integer64 print method and print
## the data again.

csv_in_use=rbind(csv06,csv07,csv08,csv09,csv10,csv11)

#轉換資料格式
csv_in_use$Date = csv_in_use$Date %>% as.Date("%Y/%m/%d")
str(csv_in_use) #總覽

## Classes 'data.table' and 'data.frame':   229495 obs. of  18 variables:
##  $ Date              : Date, format: "2018-06-01" "2018-06-01" ...
##  $ Page_Name         : chr  "柯文哲" "東森新聞" "即新聞" "侯友宜" ...
##  $ Page_ID           : 'integer64' num  6.76e-310 6.16e-310 2.60e-309 1.29e-309 1.75e-309 ...
##  $ Link              : logi  NA NA NA NA NA NA ...
##  $ Type              : chr  "photo" "link" "video" "photo" ...
##  $ All_Reaction_Count: int  365168 52932 9572 9063 8584 6963 7098 6145 5843 5222 ...
##  $ LIKE_COUNT        : int  349049 51380 9155 8912 8038 6902 6201 5768 5333 4923 ...
##  $ WOW_COUNT         : int  1008 563 39 1 31 5 82 24 6 82 ...
##  $ LOVE_COUNT        : int  13223 784 264 130 17 42 32 126 415 42 ...
##  $ HAHA_COUNT        : int  1732 145 83 20 50 11 756 203 83 171 ...
##  $ SAD_COUNT         : int  99 21 6 0 128 1 10 1 1 3 ...
##  $ ANGRY_COUNT       : int  57 39 25 0 320 2 17 23 5 1 ...
##  $ Comment_Count     : int  13879 1481 294 366 232 290 334 149 2174 118 ...
##  $ Share_Count       : int  25297 1010 875 399 122 171 2317 198 706 418 ...
##  $ Message           : chr  "從今天起，臺北市政府正式脫離千億債務俱樂部。\n\n我剛上任時，接下了1468億元的債務，過了3年半還了差不多520億元，"| __truncated__ "3年還清520億！ 柯文哲霸氣宣告：北市脫離千億債務…15萬人淚讚\n#魚干編：柯P就是狂～棒棒<U+0001F44D><U+0001F44D>\n\"| __truncated__ "柯P 3年還520億!!!(#安東)\n【相關報導】\n3年還520億！柯文哲創地方首長最高還債紀錄　13萬人淚讚\nhttps://www.ettod"| __truncated__ "今天南下高雄，鄉親告訴我「很久沒這麼熱了」，說的是天氣，也是人氣。\n\n朋友們的熱情和鼓勵，我都收到了，心暖暖的"| __truncated__ ...
##  $ Link_Title        : chr  "" "3年還清520億！ 柯文哲霸氣宣告：北市脫離千億債務…15萬人淚讚" "上任3年還520億債務萬人淚讚　柯文哲嚴格控管預算：做該做的" "Photos from 侯友宜's post" ...
##  $ Link Description  : chr  "" "台北市長柯文哲上任以來還債成績顯著，目前已還債累計達520億，今（1）日在臉書霸氣宣布，北市政府正式脫離千億債務俱"| __truncated__ "" "" ...
##  $ created_time      : 'integer64' num  7.55e-312 7.55e-312 7.55e-312 7.55e-312 7.55e-312 ...
##  - attr(*, ".internal.selfref")=<externalptr>

#新頭殼newtalk
data_new_talk <- csv_in_use %>% 
  filter(Page_Name == "新頭殼newtalk" ) %>% 
  mutate(media="新頭殼newtalk")

# China Times/中視新聞/中天新聞52家族/中時即影音
data_china <- csv_in_use %>% 
  filter(Page_Name == "China Times" |Page_Name == "中視新聞" |Page_Name == "中天新聞52家族" |Page_Name == "中時即影音" ) %>% 
  mutate(media="China Times/中視新聞/中天新聞52家族/中時即影音")

# 聯合新聞網/聯合報
data_union <- csv_in_use %>% 
  filter(Page_Name == "聯合新聞網" |Page_Name == "聯合報"  ) %>% 
  mutate(media="聯合新聞網/聯合報")

# ETtoday筋斗雲/ETtoday新聞雲/東森新聞
data_et <- csv_in_use %>% 
  filter(Page_Name == "ETtoday筋斗雲" |  Page_Name == "ETtoday新聞雲" | Page_Name == "東森新聞") %>% 
  mutate(media="ETtoday筋斗雲/ETtoday新聞雲/東森新聞")

# 三立新聞/三立新聞網
data_trio <- csv_in_use %>% 
  filter(Page_Name == "三立新聞" | Page_Name == "三立新聞網")%>% 
  mutate(media="三立新聞/三立新聞網") 

# 風傳媒
data_wind <- csv_in_use %>% 
  filter(Page_Name == "風傳媒" ) %>% 
  mutate(media="風傳媒")

# 民報
data_people <- csv_in_use %>% 
  filter(Page_Name == "民報" ) %>% 
  mutate(media="民報")

# 自由時報
data_free <- csv_in_use %>% 
  filter(Page_Name == "自由時報" ) %>% 
  mutate(media="自由時報")

# Yahoo!奇摩新聞
data_yahoo <- csv_in_use %>% 
  filter(Page_Name == "Yahoo!奇摩新聞" ) %>% 
  mutate(media="Yahoo!奇摩新聞")

# TVBS 少康戰情室/TVBS 新聞
data_tvbs <- csv_in_use %>% 
  filter(Page_Name == "TVBS 少康戰情室" | Page_Name == "TVBS 新聞")%>% 
  mutate(media="TVBS 少康戰情室/TVBS 新聞")

# 民視新聞/政經看民視 民視看正晶
data_minsi <- csv_in_use %>% 
  filter( Page_Name == "政經看民視 民視看正晶" | Page_Name == "民視新聞") %>% 
  mutate(media="民視新聞/政經看民視 民視看正晶")

# 蘋果日報/蘋果日報即時新聞
data_apple <- csv_in_use %>% 
  filter(Page_Name == "蘋果日報 台灣" |  Page_Name == "蘋果日報即時新聞") %>% 
  mutate(media="蘋果日報/蘋果日報即時新聞")

設定結巴斷詞引擎，並匯入停用字字典此處user.txt自訂，加入政治人物與政策相關字做斷詞如韓國瑜、蔡英文、柯p等人物名

jieba_tokenizer <- worker(user="dict/user.txt", stop_word =  "dict/stopwords-u8.txt")

# 設定斷詞function
policy_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

將標籤化後的資料合併，統一斷詞，並過濾數字、英文及“_"

data_media <-rbind(data_new_talk,data_china,data_union,data_et,data_trio,data_wind,data_people,data_free,data_yahoo,data_tvbs,data_apple,data_minsi)

media_token <- data_media %>% unnest_tokens(word, Message, token=policy_tokenizer) %>%
               select(Page_Name,word,Date,media)  %>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))

情緒分析

接著進行各家媒體的情緒分析使用liwc正負情緒字典

library(stringr)
require(reshape2)

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

require(tidyr)
require(readr)

p <- read_file("liwc/positive.txt")
n <- read_file("liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 5)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive

計算詞頻，如果詞彙只有一個字(如:請、年..) 或出現次數小於10，則不列入計算最後先選出詞頻最高的20個關鍵字，由大到小排序

tokens <-media_token

word_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(word_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 韓國瑜 10171
##  2 柯文哲  8908
##  3 市長    7416
##  4 高雄    6395
##  5 下載    5306
##  6 選舉    4853
##  7 選戰    4439
##  8 陳其邁  4398
##  9 姚文智  3879
## 10 民進黨  3751
## 11 侯友宜  3487
## 12 台北    3144
## 13 丁守中  2909
## 14 蘇貞昌  2774
## 15 國民黨  2556
## 16 合一    2341
## 17 直播    2115
## 18 政治    2099
## 19 候選人  1853
## 20 台中    1843

接著依據民進黨、國民黨、相對中立把斷詞後的數據標上blue、green、mid等政黨傾向然後再次合併處理

blue_tokens <- tokens %>%
  filter(media== "China Times/中視新聞/中天新聞52家族/中時即影音" | media=="聯合新聞網/聯合報" | media == "自由時報" | media=="TVBS 少康戰情室/TVBS 新聞" | media=="ETtoday筋斗雲/ETtoday新聞雲/東森新聞" | media=="Yahoo!奇摩新聞")%>%
  mutate(policy="blue")

green_tokens<- tokens %>%
  filter(media== "新頭殼newtalk" | media=="民視新聞/政經看民視 民視看正晶" | media == "三立新聞/三立新聞網" | media=="風傳媒" | media=="民報" )%>%
  mutate(policy="green")

mid_tokens<- tokens %>%
  filter(media== "蘋果日報/蘋果日報即時新聞")%>%
  mutate(policy="mid")

tokens <-rbind(mid_tokens,blue_tokens,green_tokens)

word_count <- tokens %>%
  select(Page_Name,Date,word,media,policy)

接著將斷好詞，也分類好的媒體資料和情緒字典做inner_join 留下Page_Name, Date, sentiment,media,policy 此處情緒為正向減去負向情緒之總和

下圖為全年依政黨傾向分類之媒體情緒圖藍色為中立，紅色為親藍媒體，綠色為親綠媒體

word_count <- tokens %>%
  inner_join(LIWC_ch) %>%
  count(Page_Name, Date, sentiment,media,policy) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>% 
  mutate(color = ifelse(sentiment < 0, "negative","positive"))

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

ggplot(word_count, aes(Date, sentiment, fill = policy)) +
  geom_col(show.legend = FALSE)

因綜合圖表不好比較，故將各家媒體挑出先以粉絲專頁為類別檢視，固定xy軸單位方便比較可觀查到同一媒體集團內，不同管道仍有不同報載新聞模式

如雖同為蘋果日報，即時新聞的情緒字眼相較本頁就少許多

而因為所抓取資料及為政治相關報導各家媒體情緒也隨11月選舉到來而逐漸升高

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 4, scales = "fixed")

plot_table

接著檢視合併同一集團媒體後，所出現的情緒分布可發現相對其他媒體中天新聞播送總情緒，較其他媒體高出許多而聯合報系情緒圖就相對平緩許多

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~media, ncol = 4, scales = "fixed")

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=policy))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~media, ncol = 4, scales = "fixed")

plot_table

接著將各家媒體依據藍、綠、中立合併，觀察其情緒分布可發現情緒皆因選具而逐漸升高但因所選政治傾向媒體家數不同，下階段各選取一家作為分析

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~policy, ncol = 1, scales = "fixed")

plot_table

選出不同政治傾向中，情緒起伏最大的媒體 +民報 +China Times +蘋果日報即時新聞

select_word_count <- word_count %>%
  filter(Page_Name == "民報" | Page_Name == "蘋果日報 台灣" | Page_Name == "China Times")

ggplot(select_word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 1, scales = "fixed")

Contribution to sentiment

整體情緒正負面貢獻詞彙分析

word_num <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

word_num %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

將不同政治傾向媒體分開檢視

  word_num_blue <- blue_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="blue")

  word_num_green <- green_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="green")


  word_num_mid <- mid_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))%>%
  mutate(policy="mid")

word_num_blue %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "blue Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_green %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "green Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

word_num_mid %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "mid Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

tf_idf 用全部的發言下去做好像有點不太公平用各篇文章分開來算

word_num_all <-rbind(word_num_blue,word_num_green,word_num_mid)

total_words <- word_num_all %>% 
  group_by(policy) %>% 
  summarize(total = sum(sum))

word_num_all <- left_join(word_num_all, total_words) %>%
  bind_tf_idf(word, policy, sum)%>%
  select(-total) %>%
  arrange(desc(tf_idf))

## Joining, by = "policy"

word_num_all

## # A tibble: 5,771 x 6
##    word     sum policy      tf   idf  tf_idf
##    <chr>  <int> <chr>    <dbl> <dbl>   <dbl>
##  1 精采     241 mid    0.00992  1.10 0.0109 
##  2 知識      81 mid    0.00333  1.10 0.00366
##  3 脫節      80 mid    0.00329  1.10 0.00362
##  4 拒當      77 mid    0.00317  1.10 0.00348
##  5 方念華   454 blue   0.00300  1.10 0.00330
##  6 新東     448 blue   0.00296  1.10 0.00326
##  7 募集     383 blue   0.00253  1.10 0.00278
##  8 談大     378 blue   0.00250  1.10 0.00275
##  9 編蝠     364 blue   0.00241  1.10 0.00265
## 10 超夯     271 blue   0.00179  1.10 0.00197
## # ... with 5,761 more rows

因為小編名、新聞廣告字，在各家媒體出現頻率頗高加入停用字去除，檢視排名較高的tf_idf

mystopwords <- tibble(word = c("少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水"))

word_num_all <- anti_join(word_num_all, mystopwords, by = "word")

word_num_all %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(policy) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = policy)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~policy, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

+選戰策略與群眾脫節 /小英執政與民意有脫節

+拒當台南川普/拒當女柯p

+體檢交通/青年政策

+火星劉世芳批大陸勢力滲透韓國瑜酸：下次就換火星人來了

mid_project_social

Annrison