時間篩選:2018-06-01-2018-11-31
篩選12家常見臉書新聞媒體做貼文分析,其中依據政黨傾向分類如下, 惟其中政黨傾向有程度之別,將在後續分析。
【媒體政治傾向分類】
packages = c("readr","tm", "data.table", "dplyr", "stringr", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(servr)
## Loading required package: servr
#設定中文編碼,並匯入套件
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] ""
packages = c("dplyr","ggplot2", "data.table", "scales", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(scales)
## Loading required package: scales
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
library(dplyr)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggpubr)
## Loading required package: magrittr
library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)
library(stringr)
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
require(tidyr)
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following object is masked from 'package:magrittr':
##
## extract
require(readr)
## Loading required package: readr
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
讀取各月份資料
csv06 <- fread("C:/Users/user/Desktop/R/2019/201806_data.csv", encoding = "UTF-8")
csv07 <- fread("C:/Users/user/Desktop/R/2019/201807_data.csv", encoding = "UTF-8")
csv08 <- fread("C:/Users/user/Desktop/R/2019/201808_data.csv", encoding = "UTF-8")
csv09 <- fread("C:/Users/user/Desktop/R/2019/201809_data.csv", encoding = "UTF-8")
csv10 <- fread("C:/Users/user/Desktop/R/2019/201810_data.csv", encoding = "UTF-8")
csv11 <- fread("C:/Users/user/Desktop/R/2019/201811_data.csv", encoding = "UTF-8")
csv_in_use=rbind(csv06,csv07,csv08,csv09,csv10,csv11)
#轉換資料格式
csv_in_use$Date = csv_in_use$Date %>% as.Date("%Y/%m/%d")
str(csv_in_use) #總覽
## Classes 'data.table' and 'data.frame': 229495 obs. of 18 variables:
## $ Date : Date, format: "2018-06-01" "2018-06-01" ...
## $ Page_Name : chr "柯文哲" "東森新聞" "即新聞" "侯友宜" ...
## $ Page_ID :integer64 136845026417486 124616330906800 525781104253994 261813197541354 354487984641189 339483189800311 240170506141170 102884532662 ...
## $ Link : logi NA NA NA NA NA NA ...
## $ Type : chr "photo" "link" "video" "photo" ...
## $ All_Reaction_Count: int 365168 52932 9572 9063 8584 6963 7098 6145 5843 5222 ...
## $ LIKE_COUNT : int 349049 51380 9155 8912 8038 6902 6201 5768 5333 4923 ...
## $ WOW_COUNT : int 1008 563 39 1 31 5 82 24 6 82 ...
## $ LOVE_COUNT : int 13223 784 264 130 17 42 32 126 415 42 ...
## $ HAHA_COUNT : int 1732 145 83 20 50 11 756 203 83 171 ...
## $ SAD_COUNT : int 99 21 6 0 128 1 10 1 1 3 ...
## $ ANGRY_COUNT : int 57 39 25 0 320 2 17 23 5 1 ...
## $ Comment_Count : int 13879 1481 294 366 232 290 334 149 2174 118 ...
## $ Share_Count : int 25297 1010 875 399 122 171 2317 198 706 418 ...
## $ Message : chr "從今天起,臺北市政府正式脫離千億債務俱樂部。\n\n我剛上任時,接下了1468億元的債務,過了3年半還了差不多520億元,"| __truncated__ "3年還清520億! 柯文哲霸氣宣告:北市脫離千億債務…15萬人淚讚\n#魚干編:柯P就是狂~棒棒<U+0001F44D><U+0001F44D>\n\"| __truncated__ "柯P 3年還520億!!!(#安東)\n【相關報導】\n3年還520億!柯文哲創地方首長最高還債紀錄 13萬人淚讚\nhttps://www.ettod"| __truncated__ "今天南下高雄,鄉親告訴我「很久沒這麼熱了」,說的是天氣,也是人氣。\n\n朋友們的熱情和鼓勵,我都收到了,心暖暖的"| __truncated__ ...
## $ Link_Title : chr "" "3年還清520億! 柯文哲霸氣宣告:北市脫離千億債務…15萬人淚讚" "上任3年還520億債務萬人淚讚 柯文哲嚴格控管預算:做該做的" "Photos from 侯友宜's post" ...
## $ Link Description : chr "" "台北市長柯文哲上任以來還債成績顯著,目前已還債累計達520億,今(1)日在臉書霸氣宣布,北市政府正式脫離千億債務俱"| __truncated__ "" "" ...
## $ created_time :integer64 1527821093000 1527843600000 1527852216000 1527854173000 1527841143000 1527825601000 1527864886000 1527851700000 ...
## - attr(*, ".internal.selfref")=<externalptr>
【相似集團專頁合併】 而同一媒體集團,常有多個頻道分流 在此將各媒體不同粉絲專頁合併,並賦予資料標籤
設定結巴斷詞引擎,並匯入停用字字典 此處user.txt自訂,加入政治人物與政策相關字做斷詞 如韓國瑜、蔡英文、柯p等人物名。
將標籤化後的資料合併,統一斷詞,並過濾數字、英文及“_"
data_media <-rbind(data_new_talk,data_china,data_union,data_et,data_trio,data_wind,data_people,data_free,data_yahoo,data_tvbs,data_apple,data_minsi)
media_token <- data_media %>% unnest_tokens(word, Message, token=policy_tokenizer) %>%
select(Page_Name,word,Date,media) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
filter(!grepl('_',word))
接著進行各家媒體的情緒分析,使用liwc正負情緒字典
library(stringr)
require(reshape2)
require(tidyr)
require(readr)
p <- read_file("C:/Users/user/Desktop/social_media/mid_project_media/dict/liwc/positive.txt")
n <- read_file("C:/Users/user/Desktop/social_media/mid_project_media/dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 5)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
計算詞頻,如果詞彙只有一個字(如:請、年..),或出現次數小於10,則不列入計算;最後先選出詞頻最高的20個關鍵字,由大到小排序。
tokens <-media_token
word_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(word_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 韓國瑜 10171
## 2 柯文哲 8908
## 3 市長 7416
## 4 高雄 6395
## 5 下載 5306
## 6 選舉 4853
## 7 選戰 4439
## 8 陳其邁 4398
## 9 新聞 4299
## 10 姚文智 3879
## 11 民進黨 3751
## 12 侯友宜 3487
## 13 台北 3144
## 14 丁守中 2909
## 15 蘇貞昌 2774
## 16 國民黨 2556
## 17 合一 2341
## 18 蘋果 2229
## 19 直播 2115
## 20 政治 2099
接著依據民進黨、國民黨、相對中立,把斷詞後的數據標上blue、green、mid等政黨傾向,然後再次合併處理。
接著將斷好詞,也分類好的媒體資料,和情緒字典做inner_join,留下Page_Name,Date,sentiment,media,policy,此處情緒為正向減去負向情緒之總和。
下圖為全年依政黨傾向分類之媒體情緒圖(藍色為中立,紅色為親藍媒體,綠色為親綠媒體)
word_count <- tokens %>%
inner_join(LIWC_ch) %>%
count(Page_Name, Date, sentiment,media,policy) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(color = ifelse(sentiment < 0, "negative","positive"))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
ggplot(word_count, aes(Date, sentiment, fill = policy)) +
geom_col(show.legend = FALSE)
因綜合圖表不好比較,故將各家媒體挑出;先以粉絲專頁為類別檢視,固定xy軸單位方便比較;可觀察到同一媒體集團內,不同管道仍有不同報載新聞模式(如雖同為蘋果日報,即時新聞的情緒字眼相較本頁就少許多)
ggplot(word_count, aes(Date, sentiment, fill = color)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Page_Name, ncol = 4, scales = "fixed")
#沒有資料的日期將count設為0
plot_table <- word_count %>%
ggplot()+
geom_line(aes(x=Date,y=sentiment,colour=policy))+
scale_x_date(labels = date_format("%m/%d"))+
facet_wrap(~Page_Name, ncol = 4, scales = "fixed")
plot_table
接著檢視合併同一集團媒體後,所出現的情緒分布 可發現相對其他媒體中天新聞播送總情緒,較其他媒體高出許多 而聯合報系情緒圖就相對平緩許多
ggplot(word_count, aes(Date, sentiment, fill = color)) +
geom_col(show.legend = FALSE) +
facet_wrap(~media, ncol = 4, scales = "fixed")
#沒有資料的日期將count設為0
plot_table <- word_count %>%
ggplot()+
geom_line(aes(x=Date,y=sentiment,colour=policy))+
scale_x_date(labels = date_format("%m/%d"))+
facet_wrap(~media, ncol = 4, scales = "fixed")
plot_table
在各派系媒體分類中,欲求證是否在政治事件的表述上,有親綠媒體或親藍媒體較為情緒化表述的現象,比較後發現,各立場皆 有相對情緒化與相對偏向理性陳述的媒體,加總比較後不相上下。 親藍為china >times、yahoo奇摩新聞、中視新聞情緒起伏較大;親綠則為民報、風傳媒、新頭殼、三立新聞等情緒變化較為明顯, >對於相對政治中立的蘋果日報來說,也因報導內容偏好聳動標題與情緒性用詞,情緒曲線也呈現劇烈變動。其中東森新聞、聯合 報在新聞表現上,則較少使用情緒性用語,而呈現相對平緩的情緒曲線。 總括來看,我們可以發現這些媒體的情緒皆因選具而逐漸升高。 但因所選政治傾向媒體家數不同,下階段各選取一家作為分析
接著將各家媒體依據藍、綠、中立合併,觀察其情緒分布。
ggplot(word_count, aes(Date, sentiment, fill = color)) +
geom_col(show.legend = FALSE) +
facet_wrap(~policy, ncol = 1, scales = "fixed")
plot_table <- word_count %>%
ggplot()+
geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
facet_wrap(~policy, ncol = 1, scales = "fixed")
plot_table
選出不同政治傾向中,情緒起伏最大的媒體 +民報 +China Times +蘋果日報即時新聞
select_word_count <- word_count %>%
filter(Page_Name == "民報" | Page_Name == "蘋果日報 台灣" | Page_Name == "China Times")
ggplot(select_word_count, aes(Date, sentiment, fill = color)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Page_Name, ncol = 1, scales = "fixed")
word_num <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
word_num %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
將不同政治傾向媒體分開檢視
word_num_blue <- blue_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))%>%
mutate(policy="blue")
word_num_green <- green_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))%>%
mutate(policy="green")
word_num_mid <- mid_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))%>%
mutate(policy="mid")
word_num_blue %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "blue Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
word_num_green %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "green Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
word_num_mid %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "mid Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
在整體情緒正負面貢獻詞中發現,親藍與親綠媒體的正負貢獻詞差異不大。 親藍媒體正向詞多為支持、希望、熱情,負向詞多為尷尬、激烈、緊張、無效; 而親綠媒體正向詞多為支持、分享、希望,負向詞多為批評、違法、落伍等; 相對中立的蘋果正向詞多為不錯、支持、希望,負向詞多為無效、譴責、激烈、抗議。 總括來看,三者結果皆為政治新聞常見詞彙,彼此間沒有太明顯的差異。
用全部的發言下去做好像有點不太公平,所以用各篇文章分開來算
word_num_all <-rbind(word_num_blue,word_num_green,word_num_mid)
total_words <- word_num_all %>%
group_by(policy) %>%
summarize(total = sum(sum))
word_num_all <- left_join(word_num_all, total_words) %>%
bind_tf_idf(word, policy, sum)%>%
select(-total) %>%
arrange(desc(tf_idf))
## Joining, by = "policy"
word_num_all
## # A tibble: 5,819 x 6
## word sum policy tf idf tf_idf
## <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 蘋果 2194 mid 0.0747 0.405 0.0303
## 2 爭霸戰 601 mid 0.0205 1.10 0.0225
## 3 精采 241 mid 0.00821 1.10 0.00902
## 4 好聞 240 mid 0.00817 1.10 0.00898
## 5 風雲 1039 blue 0.00656 1.10 0.00720
## 6 天下事 162 mid 0.00552 1.10 0.00606
## 7 觀點 918 green 0.00543 1.10 0.00596
## 8 新聞網 678 green 0.00401 1.10 0.00440
## 9 資訊 305 mid 0.0104 0.405 0.00421
## 10 速報 609 green 0.00360 1.10 0.00395
## # ... with 5,809 more rows
因為小編名、新聞廣告字,在各家媒體出現頻率頗高,加入停用字去除,檢視排名較高的tf_idf
mystopwords <- tibble(word = c("少康","戰情","點到","每晚","風雲","糯米","江湖","編肉","新東","談大","蝙蝠","豬頭皮","前線","蘋果","蘋果日報","快遞","盡在","精彩","好聞","天下事","拒當","魯窮宅","娜娜","鬼編","圖文","資訊","頭殼","風傳","新聞網","生活圈","速報","投稿","高畫質","文章","百里","文章","專屬","都在","這裡","優質","漏接","觀點","國會草知識","知識","歡迎","盡在","天下事","點到","每晚","少康","戰情","黑寡婦","即時","慘編","鄉編","八方","聊天室","蝙蝠","超夯","小丸子","安博","精彩","立即","專題","要聞","旁編","作者","客訴","金鋼","大安","林口","雷神","方便","分享","天天","媒體","編蝠","快點","編錯","精彩","那編","咖編","不斷更新","限量","對此","新鮮事","熱門話題","蝙蝠俠","最新報導","胖丁","東區","淡水","方念華","募集","玩樂","中視","老爹","剖析","丹利","官方","大小事","全民","全世界","本日","一手","參與","意見","魔形","孩童","指出","人物","公眾","貓女","專欄","影像","精彩","南勢角","盛竹","一事","快來","投書","萬磁王","士林","爭霸戰","精彩","海報","影創","郭子","關鍵時刻","給你","來點","前哨站","阿尼","就在","條紋","追蹤","史蒂芬","我要","日前","午間","大方","正鯿","鎖定","送票","周周","鎖定","小丑","蘆竹"))
word_num_all <- anti_join(word_num_all, mystopwords, by = "word")
word_num_all %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(policy) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = policy)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~policy, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
各家媒體詞彙的tf idf分析,在刪去小編與報紙名後,可以發現 (1)親藍媒體出現韓總、北農、藍天等,關注由韓國瑜帶起的韓流現象,與九合一大選綠地變藍天的態勢。
(2)而親綠媒體中,「北農」詞彙和韓國瑜在政治宣傳上,大力強調的個人政績有關,「市政、國家、執政」詞彙,推估因民 進黨為執政黨,故和國家執策的相關報導較有關係,
(3)在相對中立的蘋果日報排名中,因本身偏向聳動標題、具高度話題性的新聞,詞彙相對激烈,「脫節」評論藍營或綠營的 施政理念與民眾脫節,「侵台、移民」和兩岸關係相關,「鐵粉」和藍綠兩黨各自的強力擁護者的相關報導,並常用「動未條」 形容政治人物的誇張新聞,如柯文哲和學姊「陪吃說」和上綜藝節目時發生的搞笑事件。而「火星」則是韓國瑜面對綠營質疑, 中國勢力是否滲入高雄選舉的事件中,出現「要是韓國瑜民調6成、陳其邁4成就會說月球人來了,搞不好下次7比3就是火星人來 了」發言所致。