#Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
前言 台灣第15任總統、副總統選舉,於2020年1月11日舉行。在競選期間,不同世代的人之間的訴求和輿論,均牽動選情。民眾為了落實公民權的行使,以及表述各方立場,候選人及相關政黨議題不斷被分享,創造網路高聲量。網路取代傳統傳播大幅影響為政治競爭型態。因此,本組欲探究網路的「政治影響力」到底有多大?透過PTT資料進行候選人及政黨的詞頻計算找出關鍵字,以及輿情正向、反向的情緒分析。
Tsai_data = fread('Tsai_artWordFreq.csv',encoding = 'UTF-8') #蔡英文
Hen_data = fread('Hen_artWordFreq.csv',encoding = 'UTF-8') #韓國瑜
Song_data = fread('Song_artWordFreq.csv',encoding = 'UTF-8') #宋楚瑜
DPP_data = fread('DPP_artWordFreq.csv',encoding = 'UTF-8') #民進黨
KMT_data = fread('KMT_artWordFreq.csv',encoding = 'UTF-8') #國民黨
NPP_data = fread('NPP_artWordFreq.csv',encoding = 'UTF-8') #時代力量
PFP_data = fread('PFP_artWordFreq.csv',encoding = 'UTF-8') #親民黨
TPP_data = fread('TPP_artWordFreq.csv',encoding = 'UTF-8') #民眾黨
head(Tsai_data)
head(Hen_data)
head(Song_data)
##過濾特殊字元
###總統
Tsai_data = Tsai_data %>%
filter(!grepl('_',word))
Tsai_data = Tsai_data %>%
filter(!(word %in% c("https")))
Hen_data = Hen_data %>%
filter(!grepl('_',word))
Hen_data = Hen_data %>%
filter(!(word %in% c("https")))
Song_data = Song_data %>%
filter(!grepl('_',word))
Song_data = Song_data %>%
filter(!(word %in% c("https")))
###政黨
DPP_data = DPP_data %>%
filter(!grepl('_',word))
DPP_data = DPP_data %>%
filter(!(word %in% c("https")))
KMT_data = KMT_data %>%
filter(!grepl('_',word))
KMT_data = KMT_data %>%
filter(!(word %in% c("https")))
NPP_data = NPP_data %>%
filter(!grepl('_',word))
NPP_data = NPP_data %>%
filter(!(word %in% c("https")))
PFP_data = PFP_data %>%
filter(!grepl('_',word))
PFP_data = PFP_data %>%
filter(!(word %in% c("https")))
TPP_data = TPP_data %>%
filter(!grepl('_',word))
TPP_data = TPP_data %>%
filter(!(word %in% c("https")))
###總統
Tsai_data$artDate= Tsai_data$artDate %>% as.Date("%Y/%m/%d")
Hen_data$artDate= Hen_data$artDate %>% as.Date("%Y/%m/%d")
Song_data$artDate= Song_data$artDate %>% as.Date("%Y/%m/%d")
###政黨
DPP_data$artDate= DPP_data$artDate %>% as.Date("%Y/%m/%d")
KMT_data$artDate= KMT_data$artDate %>% as.Date("%Y/%m/%d")
NPP_data$artDate= NPP_data$artDate %>% as.Date("%Y/%m/%d")
PFP_data$artDate= PFP_data$artDate %>% as.Date("%Y/%m/%d")
TPP_data$artDate= TPP_data$artDate %>% as.Date("%Y/%m/%d")
###總統
word_count_Tsai <- Tsai_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_Hen <- Hen_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_Song <- Song_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
###政黨
word_count_DPP <- DPP_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_KMT <- KMT_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_NPP <- NPP_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_PFP <- PFP_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_TPP <- TPP_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count_Tsai
word_count_Hen
word_count_Song
詞頻 三筆候選人的資料中可看到最常被提及的關鍵字為候選人的姓名,再者是競爭對手、政黨關鍵詞。蔡英文及韓國瑜常被媒體作為彼此論述的議題,因此可看到兩者的關鍵詞中出現對方的姓名、政黨。而在宋楚瑜則是出現柯文哲此關鍵字,原因可推究為當時親民黨常提出小黨合作等話題。
word_count_Tsai %>% wordcloud2()
word_count_Hen %>% wordcloud2()
word_count_Song %>% wordcloud2()
# 正向字典txt檔
# 以,將字分隔
P <- read_file("liwc/positive.txt")
# 負向字典txt檔
N <- read_file("liwc/negative.txt")
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
word_count_Tsai %>% inner_join(LIWC)
word_count_Hen %>% inner_join(LIWC)
word_count_Song %>% inner_join(LIWC)
候選人正負向情緒字 在候選人正向關鍵字中最常出現的為「支持、希望、自由」,負向關鍵字為「問題、批評」多與選舉相關,而在韓國瑜的負向情緒字中出現「遲到」。
word_count_DPP %>% inner_join(LIWC)
word_count_KMT %>% inner_join(LIWC)
word_count_NPP %>% inner_join(LIWC)
word_count_PFP %>% inner_join(LIWC)
word_count_TPP %>% inner_join(LIWC)
政黨正負向情緒字 政黨的正負向關鍵字跟候選人相似。在親民黨的資料集中出現「美人」此用詞,經分析後發現為該政黨發言人姓名。另外,在各政黨的關鍵字數量中可以發現以民進黨、時代力量、民眾黨較多,推論為該政黨支持者較常使用網路進行討論,以致關鍵字數量高於國民黨、親民黨。
###總統
sentiment_count_Tsai = Tsai_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_Hen = Hen_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_Song = Song_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
###政黨
sentiment_count_DPP = DPP_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_KMT = KMT_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_NPP = NPP_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_PFP = PFP_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_TPP = TPP_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count_Tsai %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
sentiment_count_Hen %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
sentiment_count_Song %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
候選人正負向情緒 三位候選人的正負向情緒可以看出多數以正向情緒較高,可推究為選舉期間多數議題雖為爭議性取向,但在選詞上乃為正向。因此,整題觀察下可看到多為正向情緒。另外,蔡英文及韓國瑜較早提出參選計畫,網路討論聲量相較於宋楚瑜較早出現。
sentiment_count_Tsai %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_Tsai$artDate == as.Date('2019/08/05'))[1]])),colour = "yellow",size=1)+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_Tsai$artDate == as.Date('2020/01/11'))[1]])),colour = "yellow",size=1)
##8月5日負面情緒飆升
蔡英文正負向情緒 2019/8/5柯文哲組新政黨後,指控蔡英文總統身邊的人「每個人都貪汙」,引起與論譁然,總統府也立即要求柯文哲釐清及道歉。因此,當日出現較高的負向情緒。而2020/1/11當日蔡英文贏得該屆總統選舉,並且以高得票率打破先前競選紀錄。因此,正向情緒比負向情緒高出許多。
Tsai_data %>%
filter(artDate == as.Date('2019/08/05')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>20) %>% ####過濾出現太少次的字
wordcloud2()
##柯文哲表示蔡英文身邊的人都貪汙
sentiment_count_TPP %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_Hen$artDate == as.Date('2019/08/05'))
[1]])),colour = "yellow",size=1)
###同日台民黨、柯文哲的討論度也提高
民眾黨正負向情緒 柯文哲組織「台灣民眾黨」搶攻立法院席次,然而是否有意要參選總統、會不會有「郭柯配」、「柯郭配」引起大家關注。以及柯文哲在2019/8/5上午先大罵韓國瑜發大財是喊口號,又批評蔡英文沒貪汙,但身邊每個人都貪汙,甚至砲轟深綠團體是假義和團。使得此日讓民眾黨輿論聲量達到最高峰。
sentiment_count_Hen %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_Hen$artDate == as.Date('2019/12/29'))
[1]])),colour = "yellow",size=1)
韓國瑜正負情緒 2019/12/29韓國瑜參與台灣總統大選電視辯論會,以及當日晚上參加台中造勢晚會,在大雨造勢、辯論會火力全開,聲量創一年新高。首度拋出「台灣六塊肌」政策,韓國瑜表示,台灣分為六大區塊,發展不同區域特色與產業,要讓台灣重新被國際社會重視,以及提出「滿天星」計劃,培養年輕人出國交換一年,軍公教警消每年帶職帶薪出國進修。然而,網友認為這些政策內容較為空泛。因此,當日出現負向情緒最高。
Hen_data %>%
filter(artDate == as.Date('2019/12/29')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>20) %>% ####過濾出現太少次的字
wordcloud2()
###辯論會導致韓國瑜的聲量變高
sentiment_count_Song %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_Song$artDate == as.Date('2019/11/12'))
[1]])),colour = "yellow",size=1)
宋楚瑜正負向情緒 2019/11/12宋楚瑜提出參選2020總統,引發網路一片熱議,而對於宋楚瑜5度參選一事,許多網友表示真的是「活到老選到老」、「有選舉那年,就有宋楚瑜,這就是年年有瑜」、「用參選陪伴著我們長大」、「遲到但永不缺席的男子」。讓正向情緒達到高點。
Song_data %>%
filter(artDate == as.Date('2019/11/12')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>10) %>% # 過濾出現太少次的字
wordcloud2()
## 11/13 宋楚瑜宣布參選總統
sentiment_count_TPP %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count_TPP$artDate == as.Date('2019/10/03'))
[1]])),colour = "yellow",size=1)
民眾黨正負向情緒 2019/10/3民眾黨提出推出不分區立委海選計畫,然而區域立委候選人名單被質疑具有「綠營背景」,遭郭台銘拒絕合照。因此,當日的負面情緒較高。
Tsai_data %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(count = n()) %>%
data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=12))+
coord_flip()
Hen_data %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(count = n()) %>%
data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=12))+
coord_flip()
Song_data %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(count = n()) %>%
data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=12))+
coord_flip()
總統候選人標題及留言數分析
## 載入總統候選人標題文
Tsai_article = fread('Tsai_articleMetaData.csv',encoding = 'UTF-8')
Hen_article = fread('Hen_articleMetaData.csv',encoding = 'UTF-8')
commentNum_Tsai <- Tsai_article %>%
filter(!is.na(as.numeric(commentNum)))%>%
group_by(artDate) %>%
summarise(comment_Tsai = sum(as.numeric(commentNum)))
articleNum_Tsai <- Tsai_article %>%
filter(!is.na(as.numeric(commentNum))) %>%
group_by(artDate) %>%
summarise(article_Tsai = n())
commentNum_Hen <- Hen_article %>%
filter(!is.na(as.numeric(commentNum)))%>%
group_by(artDate) %>%
summarise(comment_Hen = sum(as.numeric(commentNum)))
articleNum_Hen <- Hen_article %>%
filter(!is.na(as.numeric(commentNum))) %>%
group_by(artDate) %>%
summarise(article_Hen = n())
commentNum_Hen %>%
inner_join(articleNum_Hen) %>%
inner_join(commentNum_Tsai) %>%
inner_join(articleNum_Tsai) %>%
ggplot()+
geom_line(aes(x=as.Date(artDate),y=(comment_Tsai),color="蔡英文留言數"))+
geom_line(aes(x=as.Date(artDate),y=(comment_Hen),color="韓國瑜留言數"))+
geom_line(aes(x=as.Date(artDate),y=(article_Tsai),color="蔡英文文章數"))+
geom_line(aes(x=as.Date(artDate),y=(article_Hen),color="韓國瑜文章數"))+
scale_colour_manual(values=c("#15851c","#ffb940","#0d31bf","#5dc2fc"))+
scale_x_date(labels = date_format("%m/%d"))+
scale_y_log10()
###韓國瑜文章的留言數明顯多於蔡英文,討論度較高
候選人網路聲量 整體觀察下,網路聲量大小與選舉結果不成正比,韓國瑜的總留言數、總文章數大多超過蔡英文,依舊是政治人物中的聲量王,然而選舉的結果卻是敗選,可推竟網路聲量能居高不下,可能是由爭議造成的負面聲量堆疊而成。
commentNum_Hen %>%
inner_join(articleNum_Hen) %>%
inner_join(commentNum_Tsai) %>%
inner_join(articleNum_Tsai) %>%
ggplot()+
geom_line(aes(x=as.Date(artDate),y=(comment_Tsai / article_Tsai),color="Tsai"))+
geom_line(aes(x=as.Date(artDate),y=(comment_Hen / article_Hen),color="Hen"))+
scale_x_date(labels = date_format("%m/%d"))
###但也因為討論韓國瑜的文章較多,因此韓國瑜文章的平均留言數並沒有比較多
候選人網路聲量 當我們用總言數/總文章數來看,韓國瑜的文章平均留言數並沒有比較多。雖然討論韓國瑜的文章較多,但留言數量並不多。