今年於8/28將會進行全國公投,而當中的第17案「您是否同意核四啟封商轉發電?」根據台灣民意基金會4月27日公布民調顯示,支持與反對核四商轉的比例各為43.5%與44%,形成拉鋸戰,核四去留延宕20餘年,現今社群媒體究竟是如何看待此事件。而PPT為全台最大BBS論壇,其中的八卦版與政黑版更是許多人批判政治的地方,因此我們想透過核四事件,比較出八卦版和政黑版在看待此次事件上有何不同。
資料來源:PTT HatePotics版、PTT Gossiping版
資料區間:2020/03/01~2021/05/01 所有文章
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'tidytext' was built under R version 3.6.3
## Warning: package 'wordcloud2' was built under R version 3.6.3
## Warning: package 'data.table' was built under R version 3.6.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'reshape2' was built under R version 3.6.2
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
## Warning: package 'tidyr' was built under R version 3.6.3
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'scales' was built under R version 3.6.2
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.6.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.6.3
## Warning: package 'openxlsx' was built under R version 3.6.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.6.3
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 3.6.3
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 3.6.3
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Loading required package: widyr
## Warning: package 'widyr' was built under R version 3.6.3
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(grid)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
} # 把文章和留言的斷詞結果併在一起
goss_token_art <- gossip_article %>% unnest_tokens(word, sentence, token=customized_tokenizer)
goss_token_rev <- gossip_review %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
hate_token_art <- hate_article %>% unnest_tokens(word, sentence, token=customized_tokenizer)
hate_token_rev <-hate_review %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
# 把資料併在一起
goss_token_all <- rbind(goss_token_art[,c("artDate","artUrl", "word")],goss_token_rev[,c("artDate","artUrl", "word")])
hate_token_all <- rbind(hate_token_art[,c("artDate","artUrl", "word")],hate_token_rev[,c("artDate","artUrl", "word")]) # 格式化日期欄位
goss_token_art$artDate = goss_token_art$artDate %>% as.Date("%Y/%m/%d")
goss_token_rev$artDate = goss_token_rev$artDate %>% as.Date("%Y/%m/%d")
goss_token_all$artDate = goss_token_all$artDate %>% as.Date("%Y/%m/%d")
hate_token_art$artDate = hate_token_art$artDate %>% as.Date("%Y/%m/%d")
hate_token_rev$artDate = hate_token_rev$artDate %>% as.Date("%Y/%m/%d")
hate_token_all$artDate= hate_token_all$artDate %>% as.Date("%Y/%m/%d")
# 過濾特殊字元
goss_token_art = goss_token_art %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
goss_token_rev = goss_token_rev %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
goss_token_all = goss_token_all %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
hate_token_art = hate_token_art %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
hate_token_rev = hate_token_rev %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
hate_token_all = hate_token_all %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
# 計算不同字的詞頻
goss_word_count_art <- goss_token_art %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
goss_word_count_rev <- goss_token_rev %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
goss_word_count_all <- goss_token_all %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
hate_word_count_art <- hate_token_art %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
hate_word_count_rev <- hate_token_rev %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
hate_word_count_all <- hate_token_all %>%
select(word) %>%
group_by(word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
filter(n>5) %>% # 過濾出現太少次的字
arrange(desc(n))## `summarise()` ungrouping output (override with `.groups` argument)
在文字雲中看不出兩個版主要的差異
goss_token_all <- goss_token_all %>%
mutate(from = "gossip")
hate_token_all <- hate_token_all %>%
mutate(from = "hate")
all_token <- rbind(goss_token_all[,c("artDate","artUrl", "word","from")],hate_token_all[,c("artDate","artUrl", "word","from")])
all_token_freq <- all_token %>%
mutate(part = ifelse(from == "gossip", "gossip", "hate")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `hate`)
ggplot(all_token_freq, aes(x = proportion, y = `gossip`, color = abs(`gossip` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "gossip", x = "hate")## Warning: Removed 24029 rows containing missing values (geom_point).
## Warning: Removed 24030 rows containing missing values (geom_text).
能看出八卦版和政黑版用詞並沒有顯著差異
八卦版較多頻率用詞:較多貶意詞如文組、廢氣、覺青、綠畜、蟑螂等
政黑版較多頻率用詞:較多地名如大安區、文山、新竹、花蓮、台中等
P <- read_file("./liwc/positive.txt") # 正向字典txt檔
N <- read_file("./liwc/negative.txt") # 負向字典txt檔
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
# 檢視字典
head(LIWC)## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
goss_sentiment_count_all = goss_token_all %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n()) ## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
gossip_LIWC=goss_sentiment_count_all %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day") +
ggtitle('八卦版正負情緒分數折線圖_LIWC')
hate_sentiment_count_all = hate_token_all %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n()) ## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
hate_LIWC=hate_sentiment_count_all %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day") +
ggtitle('政黑版正負情緒分數折線圖_LIWC')
multiplot(gossip_LIWC,hate_LIWC)## Warning: Removed 22 row(s) containing missing values (geom_path).
## Warning: Removed 20 row(s) containing missing values (geom_path).
>能看出八卦版在討論關於核能的部分相較於政黑版較有討論數量的起伏
3/10~3/12八卦版主要引起討論的文章為
- 蔡英文說重啟核四絕對不是選項
- 民進黨民調8成民眾支持綠電取代核電
- 民眾黨將定調公投態度,不支持核四公投
此些文章主要都為反核文章,
-> 八卦版負面大於正面
-> 政黑版正面大於負面
3/16~3/17高峰原因為
- 侯友宜問蘇貞昌核廢料要放新北多久 蘇貞昌:台灣沒地方願收容
- 反核派嗆「核廢料放你家」 徐巧芯反酸:燃煤空污可排到反核人士家?
- 任內「親手將核四封存」下台又高喊重啟 黃捷批馬英九:可笑又無能
此時主要討論為核廢料的收容,
-> 八卦版負面大於正面
-> 政黑版正面大於負面
3/28~3/30高峰原因為
- [新聞] 重啟無望!核四最後一批燃料棒送往美國
- 歐盟科學諮詢機構:核電符合綠能投資
- 美麗島電子報民調》反核成過去式?4成5台灣人認為核電利大於弊
- 蘇揆:無法處理核廢料就沒資格談核能
-> 八卦版負面皆大於正面
-> 政黑版情緒接近
4/9~4/12
- 4/9 苗博雅問核四商轉公投?柯文哲:愚人問題智者無法回答
- 4/11 核四問題再嗆苗博雅 柯文哲:帶頭搞意識形態 讓人很不高興
- 4/11 反核嗆柯! 黃捷:沒理念的黨主席還能推動什麼價值?
-> 八卦版與政黑版負面皆大於正面,而八卦版負面字詞相較多更多
4/23 高峰原因為
- 林飛帆:降空污、減碳、反核、國民黨別亂
-> 八卦版並沒有明顯討論
4/27 高峰原因為
- 糗了!謝長廷秀台灣排核廢水證據 被抓包
- 蔡英文轟馬英九:7年前封存核四 現在卻改變立場
-> 八卦版負面大於正面
-> 政黑版並沒有明顯討論
gossip_LIWC_ratio = goss_sentiment_count_all %>%
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day") +
ggtitle('八卦版正負情緒比例折線圖_LIWC')
hate_LIWC_ratio=hate_sentiment_count_all %>%
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day") +
ggtitle('政黑版正負情緒比例折線圖_LIWC')
multiplot(gossip_LIWC_ratio,hate_LIWC_ratio)## Warning: Removed 22 row(s) containing missing values (geom_path).
## Warning: Removed 20 row(s) containing missing values (geom_path).
>以比例來看,八卦版負面情緒多大於正面情緒
而政黑版在4/3後議題主要為苗博雅質詢柯文哲議題,多為負面情緒
goss_sentiment_word_count_art = goss_word_count_art %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
goss_sentiment_word_count_rev = goss_word_count_rev %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
goss_sentiment_word_count_all = goss_word_count_all %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
hate_sentiment_word_count_art = hate_word_count_art %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
hate_sentiment_word_count_rev = hate_word_count_rev %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
hate_sentiment_word_count_all = hate_word_count_all %>%
inner_join(LIWC) %>%
arrange(desc(n)) %>%
data.frame() #存成data frame ## Joining, by = "word"
gossip_LIWC_col =goss_sentiment_word_count_all %>%
top_n(30,wt = n) %>%
mutate(word = reorder(word, n)) %>% #重新排序word,
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(family="", size = 14))+
coord_flip()+
ggtitle('八卦版_LIWC')
hate_LIWC_col = hate_sentiment_word_count_all %>%
top_n(30,wt = n) %>%
mutate(word = reorder(word, n)) %>% #重新排序word,
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(family="", size = 14))+
coord_flip()+
ggtitle('政黑版_LIWC')
gossip_LIWC_colCVAW4_org <- read.csv("cvaw412.csv",header=T,stringsAsFactors = FALSE,encoding = "UTF-8")
#為方便處理轉換成 -4 到 4 ( 原本為 1 到 9 )
CVAW4_org = CVAW4_org %>% mutate(Valence_Mean=Valence_Mean-5,Arousal_Mean=Arousal_Mean-5)
plot_ly(data=CVAW4_org,x=~Valence_Mean,y=~Arousal_Mean,text = ~Word,
marker = list(size = 7,
color = 'rgba(182, 192, 255, .9)',
line = list(color = 'rgba(0, 0, 152, .8)',
width = 2))) %>%
layout(
title = "CVAW4",
xaxis = list(title = "情緒值"),
yaxis = list(title = "亢奮值")
)## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
goss_sentiment_word_count_artc = gossip_article %>%
group_by(artTitle,artDate) %>%
unnest_tokens(word, sentence, token=customized_tokenizer) %>%
ungroup() %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1) %>%
select(artTitle,artDate,word) %>%
group_by(artTitle,artDate,word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
ungroup() %>%
#filter(n>5) %>% # 過濾出現太少次的字
inner_join(CVAW4_org,by=c("word"="Word")) %>%
group_by(word) %>% #這個是這篇文章有一樣的字
mutate(Valence_per_url=mean(n*Valence_Mean),Arousal_per_url=mean(n*Arousal_Mean)) %>%
ungroup() %>%
group_by(artTitle,artDate) %>% #我選同一篇文章
mutate(cv=mean(n*Valence_Mean),cv2=mean(n*Arousal_Mean)) %>%
arrange(desc(n)) ## `summarise()` regrouping output by 'artTitle', 'artDate' (override with `.groups` argument)
goss_sentiment_word_count_artc$artDate = goss_sentiment_word_count_artc$artDate %>% as.Date("%Y/%m/%d")
#將離群值排除
tmp = goss_sentiment_word_count_artc %>%
filter(cv2>=quantile(cv2,0.995)|cv2<=quantile(cv2,0.005)|
cv>=quantile(cv,0.995)|cv<=quantile(cv,0.005))
plot_ly( data = goss_sentiment_word_count_artc ,x=~cv ,y=~cv2 ,text = ~artTitle,type = 'scatter', mode = 'markers') %>%
layout(
title = "八卦版中各篇文章",
xaxis = list(title = "情緒值(越大越正面)"),
yaxis = list(title = "亢奮值(越大越亢奮)")
)八卦版多數的文章較為冷靜、並且負面略大於正面
hate_sentiment_word_count_artc = hate_article %>%
group_by(artTitle,artDate) %>%
unnest_tokens(word, sentence, token=customized_tokenizer) %>%
ungroup() %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1) %>%
select(artTitle,artDate,word) %>%
group_by(artTitle,artDate,word) %>%
summarise(n=n()) %>% # 算字詞單篇總數用summarise
ungroup() %>%
#filter(n>5) %>% # 過濾出現太少次的字
inner_join(CVAW4_org,by=c("word"="Word")) %>%
group_by(word) %>% #這個是這篇文章有一樣的字
mutate(Valence_per_url=mean(n*Valence_Mean),Arousal_per_url=mean(n*Arousal_Mean)) %>%
ungroup() %>%
group_by(artTitle,artDate) %>% #我選同一篇文章
mutate(cv=mean(n*Valence_Mean),cv2=mean(n*Arousal_Mean)) %>%
arrange(desc(n)) ## `summarise()` regrouping output by 'artTitle', 'artDate' (override with `.groups` argument)
hate_sentiment_word_count_artc$artDate = hate_sentiment_word_count_artc$artDate %>% as.Date("%Y/%m/%d")
#將離群值排除
tmp = hate_sentiment_word_count_artc %>%
filter(cv2>=quantile(cv2,0.995)|cv2<=quantile(cv2,0.005)|
cv>=quantile(cv,0.995)|cv<=quantile(cv,0.005))
plot_ly( data = hate_sentiment_word_count_artc ,x=~cv ,y=~cv2 ,text = ~artTitle,type = 'scatter', mode = 'markers') %>%
layout(
title = "政黑版中各篇文章",
xaxis = list(title = "情緒值(越大越正面)"),
yaxis = list(title = "亢奮值(越大越亢奮)")
)政黑版也是多數的文章較為冷靜、並且正負面也略為相同
## `summarise()` ungrouping output (override with `.groups` argument)
goss_sentiment_word_count_artc_cvaw = goss_sentiment_word_count_artc %>%
group_by(artDate,artTitle) %>%
mutate(sss1 = mean(n*Valence_Mean)) %>% #每篇的情緒
mutate(sss2 = mean(n*Arousal_Mean)) %>%
select(artDate,sss1,sss2) %>%
group_by(artDate) %>%
mutate(ss1 = mean(sss1)) %>% #每一天的情緒
mutate(ss2 = mean(sss2)) %>%
select(artDate,ss1,ss2) %>%
unique() %>%
inner_join(tem_1,by=("artDate" = "artDate"))## Adding missing grouping variables: `artTitle`
pic1 <- goss_sentiment_word_count_artc_cvaw %>%
#group_by(artDate) %>%
ggplot()+
labs(x = "Date" , y = "Value",title = "八卦版_CVAW => blue:情緒值 / red:亢奮值") +
theme(text=element_text(family="", size = 10))+
geom_line(aes(x=artDate,y=ss1),colour="blue",size=1.5)+
geom_line(aes(x=artDate,y=ss2),colour="red",size=1.5)+
geom_hline(yintercept = 0 ,size=1.5)+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day") ## `summarise()` ungrouping output (override with `.groups` argument)
hate_sentiment_word_count_artc_cvaw = hate_sentiment_word_count_artc %>%
group_by(artDate,artTitle) %>%
mutate(sss1 = mean(n*Valence_Mean)) %>% #每篇的情緒
mutate(sss2 = mean(n*Arousal_Mean)) %>%
select(artDate,sss1,sss2) %>%
group_by(artDate) %>%
mutate(ss1 = mean(sss1)) %>% #每一天的情緒
mutate(ss2 = mean(sss2)) %>%
select(artDate,ss1,ss2) %>%
unique() %>%
inner_join(tem_2,by=("artDate" = "artDate"))## Adding missing grouping variables: `artTitle`
pic2 <- hate_sentiment_word_count_artc_cvaw %>%
#group_by(artDate) %>%
ggplot()+
labs(x = "Date" , y = "Value",title = "政黑版_CVAW => blue:情緒值 / red:亢奮值") +
theme(text=element_text(family="", size = 10))+
geom_line(aes(x=artDate,y=ss1),colour="blue",size=1.5)+
geom_line(aes(x=artDate,y=ss2),colour="red",size=1.5)+
geom_hline(yintercept = 0 ,size=1.5)+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-01','2021-05-01')),
breaks ="5 day")
multiplot(pic1,pic2)## Warning: Removed 11 row(s) containing missing values (geom_path).
## Warning: Removed 11 row(s) containing missing values (geom_path).
## Warning: Removed 9 row(s) containing missing values (geom_path).
## Warning: Removed 9 row(s) containing missing values (geom_path).
> 每日平均每篇文章平均每篇評論的情緒值和亢奮值,
- 能看出八卦版每日平均文章分數接近0但偏負面,約為0~-1,亢奮值也偏冷靜,約為0~-1
- 政黑板相較於八卦版文章較為正面,分數約為-1~3,而亢奮值也偏冷靜,約為0~-3
gossip_R <- gossip_review %>%
mutate(sentence=gsub("[\n]{2,}", "。", cmtContent)) %>%
mutate(sentence=gsub("\n", "", cmtContent)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", cmtContent))
gossip_M <- gossip_article %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
black_R <- hate_review %>%
mutate(sentence=gsub("[\n]{2,}", "。", cmtContent)) %>%
mutate(sentence=gsub("\n", "", cmtContent)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", cmtContent))
black_M <- hate_article %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
black_data <- rbind(black_M[,c("artDate","artUrl", "sentence")],black_R[,c("artDate","artUrl", "sentence")])
gossip_data <- rbind(gossip_M[,c("artDate","artUrl", "sentence")],gossip_R[,c("artDate","artUrl", "sentence")])
black_data <- black_data %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|1.新聞網址︰ 新聞來源︰ETtoday新聞雲", "", sentence))
gossip_data <- gossip_data %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|1.新聞網址︰ 新聞來源︰ETtoday新聞雲", "", sentence))
gossip_data_sentences <- strsplit(gossip_data$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
gossip_data_sentences <- data.frame(
artUrl = rep(gossip_data$artUrl, sapply(gossip_data_sentences, length)),
sentence = unlist(gossip_data_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
gossip_data_sentences$sentence <- as.character(gossip_data_sentences$sentence)
black_data_sentences <- strsplit(black_data$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
black_data_sentences <- data.frame(
artUrl = rep(black_data$artUrl, sapply(black_data_sentences, length)),
sentence = unlist(black_data_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
black_data_sentences$sentence <- as.character(black_data_sentences$sentence)
chi_tokenizer <- function(t){
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}gossip_data_words <- gossip_data %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)%>%
left_join(gossip_M,by="artUrl")%>%
select(artTitle,artUrl,word,n)
black_data_words <- black_data %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)%>%
left_join(black_M,by="artUrl")%>%
select(artTitle,artUrl,word,n)## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "artUrl"
## Joining, by = "artUrl"
black_data_words_tf_idf <- black_data_words %>%
bind_tf_idf(word, artUrl, n)
gossip_data_words_tf_idf <- gossip_data_words %>%
bind_tf_idf(word, artUrl, n)gossip_data_words_tf_idf %>%
group_by(artUrl) %>%
filter(n>15)%>%
slice_max(tf_idf, n=5) %>%
arrange(desc(tf_idf))%>%
head(10)## # A tibble: 10 x 8
## # Groups: artUrl [10]
## artTitle artUrl word n total tf idf tf_idf
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Re:[新聞]定調公投態度民眾黨將~ https://www.ptt.cc/~ 草包 16 217 0.0737 4.30 0.317
## 2 [問卦]「現階段反核的是低端」,怎~ https://www.ptt.cc/~ 低端 16 262 0.0611 3.73 0.228
## 3 Re:[新聞]蔡英文:核四是危險拼~ https://www.ptt.cc/~ 拼裝 35 565 0.0619 3.41 0.211
## 4 [新聞]天道盟揪團搶300億核電工~ https://www.ptt.cc/~ 黑道 30 776 0.0387 5.40 0.209
## 5 [新聞]反核四!民進黨要求簽承諾書~ https://www.ptt.cc/~ 宜蘭 22 510 0.0431 4.30 0.186
## 6 [問卦]太陽能其實也算是核電吧?~ https://www.ptt.cc/~ 核融合~ 19 313 0.0607 2.97 0.181
## 7 [問卦]所以核四到底484拼裝車<U+0447>~ https://www.ptt.cc/~ 拼裝 17 332 0.0512 3.41 0.175
## 8 Re:[新聞]蔡英文:核四是危險拼~ https://www.ptt.cc/~ 一步到位~ 16 528 0.0303 5.40 0.164
## 9 Re:[問卦]核廢料補助1桶254~ https://www.ptt.cc/~ 直徑 18 654 0.0275 5.81 0.160
## 10 [新聞]憂藻礁公投缺電工商團體籲政~ https://www.ptt.cc/~ 工商 16 548 0.0292 5.40 0.158
black_data_words_tf_idf %>%
group_by(artUrl) %>%
filter(n>15)%>%
slice_max(tf_idf, n=5) %>%
arrange(desc(tf_idf))%>%
head(10)## # A tibble: 10 x 8
## # Groups: artUrl [10]
## artTitle artUrl word n total tf idf tf_idf
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 [轉錄]藻礁與核四:能源公投與不可~ https://www.ptt.cc/~ 偏好 35 898 0.0390 5.60 0.218
## 2 [新聞]核四落腳台中?民進黨團突襲~ https://www.ptt.cc/~ 新竹 36 741 0.0486 3.81 0.185
## 3 Re:[討論]以核養綠?核電廠不耗~ https://www.ptt.cc/~ 海水 16 401 0.0399 3.81 0.152
## 4 Re:[討論]還是重啟核四吧~ https://www.ptt.cc/~ 完成 18 429 0.0420 2.51 0.105
## 5 Re:[新聞]駁斥核四在台中重啟馬~ https://www.ptt.cc/~ 新竹 17 717 0.0237 3.81 0.0903
## 6 [新聞]學者建議核四移至台中盧秀燕~ https://www.ptt.cc/~ 台中 19 343 0.0554 1.61 0.0894
## 7 [新聞]核四遷台中? 盧秀燕:我跟~ https://www.ptt.cc/~ 台中 18 369 0.0488 1.61 0.0787
## 8 [新聞]議員籲對核四商轉表態柯文哲~ https://www.ptt.cc/~ 柯文哲~ 18 432 0.0417 1.89 0.0787
## 9 Fw:[新聞]2050年零碳排!比~ https://www.ptt.cc/~ 比爾蓋茲~ 36 2247 0.0160 4.91 0.0786
## 10 [新聞]國民黨:民進黨別騙人民提出~ https://www.ptt.cc/~ 土條 19 422 0.0450 1.69 0.0761
gossip_data_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)%>%
head(10)## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 公投 24
## 2 柯文哲 18
## 3 表態 17
## 4 核廢料 17
## 5 處理 16
## 6 你家 14
## 7 林義雄 14
## 8 補助 14
## 9 太陽能 13
## 10 封存 13
black_data_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)%>%
head(10)## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 柯文哲 14
## 2 巧芯 10
## 3 台中 9
## 4 侯友宜 9
## 5 馬英九 9
## 6 國民黨 9
## 7 綠能 9
## 8 縣市 9
## 9 公投 8
## 10 核廢料 8
jieba_tokenizer = worker(user="自建字典.txt", stop_word = "stop_words.txt")
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}gossip_data_bigram <- gossip_data %>%
unnest_tokens(bigram, sentence, token = jieba_bigram) %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
head(gossip_data_bigram,10)## bigram n
## 1: 核廢料 放 312
## 2: 放 你家 240
## 3: 重啟 核四 216
## 4: 台灣 價值 194
## 5: 時空 背景 186
## 6: 放 我家 171
## 7: 中共 同路人 167
## 8: 能源 政策 165
## 9: 台灣 人 161
## 10: 非核 家園 133
black_data_bigram <- black_data %>%
unnest_tokens(bigram, sentence, token = jieba_bigram) %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
head(black_data_bigram,10)## bigram n
## 1: 重啟 核四 182
## 2: 核四 公投 115
## 3: 能源 政策 99
## 4: 核廢料 放 90
## 5: 核四 重啟 81
## 6: 台灣 人 65
## 7: 非核 家園 61
## 8: 擁核 派 60
## 9: 支持 核四 51
## 10: 藻礁 公投 44
gossip_data_trigram <- gossip_data %>%
unnest_tokens(ngrams, sentence, token=jieba_trigram)
gossip_data_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
count(ngrams, sort = TRUE)%>%
head(10)## ngrams n
## 1: 核廢料 放 你家 130
## 2: 核廢料 放 我家 45
## 3: 時空 背景 之術 39
## 4: 抗 中保 台 38
## 5: 重啟 核四 公投 27
## 6: 國家 能源 政策 25
## 7: 廢氣 排 你家 24
## 8: 支持 重啟 核四 22
## 9: 問 台北 市長 18
## 10: 說 核廢料 放 18
black_data_trigram <- black_data %>%
unnest_tokens(ngrams, sentence, token=jieba_trigram)
black_data_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
count(ngrams, sort = TRUE)%>%
head(10)## ngrams n
## 1: 核廢料 放 我家 19
## 2: 支持 核四 重啟 16
## 3: 重啟 核四 公投 14
## 4: 愚人 問題 智者 14
## 5: 核廢料 放 你家 13
## 6: 核四 重啟 公投 11
## 7: 新北 市長 侯友宜 11
## 8: 說 核廢料 放 11
## 9: 公投 綁 大選 10
## 10: 台北 市長 柯文哲 10
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## # A tibble: 15,092,176 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 核四 台灣 138
## 2 台灣 核四 138
## 3 核四 反核 131
## 4 反核 核四 131
## 5 問題 核四 129
## 6 核四 問題 129
## 7 核四 公投 126
## 8 公投 核四 126
## 9 台灣 反核 124
## 10 反核 台灣 124
## # ... with 15,092,166 more rows
gossip_word_pairs <- gossip_data_words %>%
pairwise_count(word, artUrl, sort = TRUE)
gossip_word_pairs## # A tibble: 37,895,608 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 台灣 核四 303
## 2 核四 台灣 303
## 3 核能 台灣 297
## 4 台灣 核能 297
## 5 台灣 問題 292
## 6 問題 台灣 292
## 7 反核 核四 290
## 8 核四 反核 290
## 9 反核 台灣 283
## 10 台灣 反核 283
## # ... with 37,895,598 more rows
gossip_word_cors <- gossip_data_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
gossip_word_cors ## # A tibble: 4,741,506 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 吸滿 吸好 0.876
## 2 吸好 吸滿 0.876
## 3 裝睡 不醒 0.856
## 4 不醒 裝睡 0.856
## 5 溫室 氣體 0.826
## 6 氣體 溫室 0.826
## 7 背景 時空 0.826
## 8 時空 背景 0.826
## 9 非核 家園 0.814
## 10 家園 非核 0.814
## # ... with 4,741,496 more rows
black_word_cors <- black_data_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
black_word_cors ## # A tibble: 965,306 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 非核 家園 0.881
## 2 家園 非核 0.881
## 3 質詢 苗博雅 0.760
## 4 苗博雅 質詢 0.760
## 5 時空 背景 0.750
## 6 背景 時空 0.750
## 7 松山 信義 0.742
## 8 信義 松山 0.742
## 9 絕食 林義雄 0.666
## 10 林義雄 絕食 0.666
## # ... with 965,296 more rows
gossip_word_cors %>%
mutate(item1=gsub("不支持","反對",item1) )%>%
filter(item1 %in% c("支持", "反對")) %>%
filter(!item2 %in% c("支持", "不支持","反對")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation,fill=item1)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = ""))## Selecting by correlation
八卦版中反對多為民眾黨相關事物,因為民眾黨明確表示反對核四運轉公投
支持的多與此次公投或政策相關,如公投、政策、立場等,並且較多辱罵詞,如低能、智障、可憐
black_word_cors %>%
mutate(item1=gsub("不支持","反對",item1) )%>%
filter(item1 %in% c("支持", "反對")) %>%
filter(!item2 %in% c("支持", "不支持","反對")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation,fill=item1)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = ""))## Selecting by correlation
政黑版中涵蓋許多部份,如民進黨、侯友宜等明確表示反核四運轉,
支持的多與公投或政策相關,如公投、反核、缺電等,
gossip_word_cors %>%
filter(item1 %in% c("民進黨", "國民黨","民眾黨")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation,fill=item1)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "")) #加入中文字型設定,避免中文字顯示錯誤。## Selecting by correlation
black_word_cors %>%
filter(item1 %in% c("民進黨", "國民黨","民眾黨")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation,fill=item1)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "")) #加入中文字型設定,避免中文字顯示錯誤。## Selecting by correlation
set.seed(2020)
gossip_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
前總統-絕食抗議:馬總統任內因林義雄絕食抗議停建核四
巧芯-她家:挺核四重啟台北市國民黨議員徐巧芯於臉書留言「(核廢料)放我家的話OK!」
馬斯克-太空:馬斯克是太空技術探索公司 SpaceX創辦人,因多次提出移民火星的計畫,因此鄉民經常說請馬斯克將核廢料送往太空即可解決核廢料問題
林義雄-絕食-聖人-餓死:挺核鄉民會嘲諷林義雄是聖人、當初應該讓林義雄餓死等
gossip_word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
相關性大於0.6的多是鄉民常用之語,如「吸好吸滿」、「中共同路人」、「非核家園」等
而其中煙囪-窗戶是指鄉民常酸反核之人「不支持核電的話,火力煙囪通你家窗戶」
另外提及苗博雅的相關詞彙多是在4/9~4/12苗博雅議員於議會質詢柯文哲等相關議題,而柯文者回覆苗博雅「愚人的問題,智者無法回答」
set.seed(2020)
black_word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
政黑版多是探討核電相關或政治相關詞彙,如討論「風電、風力、太陽能」「乾式貯存」等
而與八卦版比較起來,如林義雄只有絕食與它相關,無餓死、聖人等詞彙
苗博雅議題也多是在與柯文哲相關,但無「愚人-智者」等詞
black_word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
與八卦版比起來,政黑版教討論核電相關事務,多出了「電網-基載」「低階、高階」(核廢料)等
我們發現
針對目前執政黨反核立場,八卦版鄉民較持反對意見,當有執政黨談及反核、核廢料相關議題,八卦版之負面情緒會明顯大於正面情緒
政黑版同樣主題文章與留言較單純,多數圍繞在核四、能源的議題,較不會有反諷、辱罵用詞,並且較常針對政治人物和黨派進行討論
八卦版內容較複雜,並且具有許多廢文和辱罵用詞,若想認真探討一件政治相關議題,較不建議在八卦版做搜尋
有民眾黨、柯文哲對於核能的相關議題皆能在兩版引起熱烈討論