系統參數設定
## [1] ""
安裝需要的packages
# echo = T,results = 'hide'
# Packages this analysis depends on; any that are not installed yet are
# installed by the loop below.
packages <- c(
  "dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr",
  "data.table", "reshape2", "wordcloud", "tidyr", "scales", "plotly",
  "googleVis", "ggpubr", "highcharter",
  "jiebaR"  # attached further down, so it has to be installable here too
)
existing <- as.character(installed.packages()[, 1])
# setdiff() also collapses duplicates, so each missing package is installed once.
for (pkg in setdiff(packages, existing)) install.packages(pkg)
讀進library
# Attach every package used below (the startup messages can be hidden when
# knitting). library() is used throughout so that a missing package stops the
# script immediately instead of returning FALSE the way require() does.
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)
library(plotly)
library(googleVis)
library(ggpubr)
library(highcharter)
此篇將以中天關台作為主題,看ptt鄉民的討論如何,而以下將針對幾點作分析:
1.中天關台的訊息在什麼時候討論度最高? 2.鄉民對中天關台的意見是甚麼? 3.鄉民正面和負面討論的情緒如何?
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## cmtPoster = col_character(),
## cmtStatus = col_character(),
## cmtDate = col_datetime(format = ""),
## cmtContent = col_character()
## )
(1). 文章斷詞
設定斷詞引擎
# Segmentation engine configured with a user dictionary and a stop-word file.
jieba_tokenizer <- worker(user="D:/W5/user_dict_52.txt", stop_word ="D:/W5//stop_words.txt")

# Tokenizer passed to unnest_tokens(): applies the jieba engine to each element
# of `t` and returns a list holding one token vector per element.
customized_tokenizer <- function(t) {
  lapply(t, function(x) segment(x, jieba_tokenizer))
}

# Tokenize post bodies (MetaData$sentence) and comment bodies
# (Reviews$cmtContent); both tables are created outside this section.
MToken <- MetaData %>% unnest_tokens(word, sentence, token = customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token = customized_tokenizer)

# Stack post tokens and comment tokens, keeping only the three shared columns.
data <- rbind(
  MToken[, c("artDate", "artUrl", "word")],
  RToken[, c("artDate", "artUrl", "word")]
)
(2). 資料基本清理
# Normalise the date column (year/month/day input format).
data$artDate <- as.Date(data$artDate, "%Y/%m/%d")

# Token clean-up: drop anything containing punctuation, ASCII letters or
# digits, and drop single-character tokens.
data_select <- data %>%
  filter(!grepl("[[:punct:]]", word)) %>%
  # The former class ['^0-9a-z'] also listed the literal characters ' and ^
  # (already removed by the punctuation filter) and skipped uppercase letters.
  filter(!grepl("[0-9A-Za-z]", word)) %>%
  filter(nchar(word) > 1)

# Per-day word frequencies; columns: artDate, word, count.
word_count <- data_select %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>%
  filter(count > 3) %>%  # keep words seen more than 3 times on a given day
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## # A tibble: 20,514 x 3
## # Groups: artDate [145]
## artDate word count
## <date> <chr> <int>
## 1 2020-11-18 中天 2295
## 2 2020-11-18 新聞 1031
## 3 2020-10-26 中天 941
## 4 2020-11-18 三立 892
## 5 2020-12-11 中天 853
## 6 2020-11-19 中天 790
## 7 2020-12-13 中天 774
## 8 2020-12-12 中天 700
## 9 2020-11-18 崩潰 638
## 10 2020-10-25 中天 595
## # ... with 20,504 more rows
LIWC 字典全名為 Linguistic Inquiry and Word Count,由心理學家 Pennebaker 於 2001 年出版,將字詞分為正向情緒與負向情緒兩類。
讀檔,字詞間以“,”將字分隔
# Load both sentiment lexicons; read_file() hands back each file as a single
# string, which typeof() confirms below.
P <- read_file("D:/W5/positive.txt")  # positive-word lexicon
N <- read_file("D:/W5/negative.txt")  # negative-word lexicon
typeof(P)
## [1] "character"
分割字詞,並將兩個情緒字典併在一起
# Break each lexicon string apart at the commas. strsplit() returns a list with
# one entry per input string, so unlist() yields the plain word vector.
P <- unlist(strsplit(P, ","))
N <- unlist(strsplit(N, ","))

# One data frame per lexicon with columns word and sentiment.
# (Original author's notes: 664 positive entries, 1047 negative entries.)
P <- data.frame(word = P, sentiment = "positive")
N <- data.frame(word = N, sentiment = "negative")

# Combined lexicon, then a quick look at its first rows.
LIWC <- rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
首先看看2020有關中天關台之發文數
# Number of posts per day, drawn as points plus a line and rendered as an
# interactive plotly chart.
MetaData$artDate <- as.Date(MetaData$artDate, "%Y/%m/%d")
post_plot <- MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count)) +
  geom_point() +
  geom_line() +
  ggtitle("2020有關中天關台之發文數") +
  xlab("date (yr'mn)") +
  scale_x_date(date_breaks = "1 months", date_labels = "%m/%d")
ggplotly(post_plot)
找出文集中,對於LIWC字典是positive和negative的字
算出每天情緒總和(sentiment_count)
# Daily number of lexicon hits per sentiment.
# Resulting columns: artDate, sentiment, count.
sentiment_count <- inner_join(select(data_select, artDate, word), LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
再來對ptt鄉民對中天關台的情緒做分析,經上圖可以看到,在2020-10-1後,中天關台的討論度開始攀升,以下將針對2020-10-1後的文章做情緒分析
## [1] "2020-01-02" "2020-12-30"
# Daily positive / negative counts as coloured lines, with the x axis limited
# to 2020-10-01 .. 2020-12-31, rendered as an interactive plotly chart.
sentiment_plot <- sentiment_count %>%
  ggplot(aes(x = artDate, y = count, colour = sentiment)) +
  geom_point() +
  geom_line() +
  scale_x_date(
    labels = date_format("%m/%d"),
    limits = as.Date(c("2020-10-01", "2020-12-31"))
  )
ggplotly(sentiment_plot)
標準化後看情緒的波動
# Normalised view: within each day, every sentiment's count is divided by that
# day's total, so the plotted value is a share rather than a raw count.
recount_sentiment <- sentiment_count %>%
  group_by(artDate) %>%
  mutate(ratio = count / sum(count)) %>%
  ggplot(aes(x = artDate, y = ratio, colour = sentiment)) +
  geom_line() +
  scale_x_date(
    labels = date_format("%m/%d"),
    limits = as.Date(c("2020-10-01", "2020-12-31"))
  )
ggplotly(recount_sentiment)
# Rank the days by their total sentiment count
# Total lexicon hits per day (both sentiments added up), largest first.
sentiment_count %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 158 x 2
## artDate sum
## <date> <int>
## 1 2020-11-18 5579
## 2 2020-12-11 2290
## 3 2020-11-19 2275
## 4 2020-10-26 1957
## 5 2020-12-13 1822
## 6 2020-12-12 1741
## 7 2020-10-25 1252
## 8 2020-10-15 924
## 9 2020-10-14 856
## 10 2020-01-11 849
## # ... with 148 more rows
先對討論度最高的2020-11-18看看鄉民們對中天關台的評價如何,而在上一節的情緒分析中,可以看到2020-11-18當天是負面情緒多於正面情緒的
# Word cloud for 2020-11-18: topic words that dominate every day are removed
# first, and only words counted more than 100 times are drawn.
word_count %>%
  filter(
    artDate == as.Date("2020-11-18"),
    !(word %in% c("中天","新聞","換照"))
  ) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 100) %>%
  wordcloud2()
## Adding missing grouping variables: `artDate`
經文字雲看2020-11-18的結果,在過濾掉「中天」及「新聞」二字後,出現了「三立」、「崩潰」、「台灣」、「開心」、「綠共」等字,可以看到大多數人持正面意見,而關於「自由」的字眼也非常多。
較為意外的是,「三立」電視台的字眼出現的非常頻繁,推測是有另一派的鄉民希望執政黨不要雙標看待事情,對於較親民進黨派的三立電視台也要用同樣標準審查。
接下來看看2020-11-19鄉民的普遍反應如何,在前一節的情緒分析中,鄉民的情緒是負面大於正面的
# Word cloud for 2020-11-19, without the two dominant topic words; only words
# counted more than 20 times are drawn.
word_count %>%
  filter(
    artDate == as.Date("2020-11-19"),
    !(word %in% c("中天","新聞"))
  ) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 20) %>%
  wordcloud2()
## Adding missing grouping variables: `artDate`
接著對2020-12-11~2020-12-13進行分析
# Word cloud for the three days 2020-12-11 .. 2020-12-13.
# The previous filter compared against as.Date('2020-12-11~2020-12-13'), which
# parses only the leading date, so 12-12 and 12-13 were silently left out.
# Explicit lower/upper bounds select the whole range; counts are then summed
# across the three days before the > 20 threshold is applied.
word_count %>%
  filter(!(word %in% c("中天","新聞"))) %>%
  filter(
    artDate >= as.Date("2020-12-11"),
    artDate <= as.Date("2020-12-13")
  ) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 20) %>%  # drop words that appear too rarely
  wordcloud2()
## Adding missing grouping variables: `artDate`
* 可以看到2020-12-11~2020-12-13這幾天大多數出現的關鍵字與前面分析的結果相似,關台為主要鄉民討論的部分
算出所有字詞的詞頻(sentiment_sum),找出情緒代表字
# Overall frequency of every lexicon word across all days, most frequent first.
# Resulting columns: word, sentiment, sum.
sentiment_sum <- word_count %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum)) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

# Horizontal bar chart of the 30 most frequent lexicon words, one panel per
# sentiment; bars are ordered by frequency.
sentiment_sum %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
### 正負情緒文字雲
# Comparison word cloud: acast() reshapes sentiment_sum into a word-by-sentiment
# matrix (0 where a word lacks that sentiment) and comparison.cloud() draws one
# region per matrix column, showing at most 50 words.
sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    # Colours follow the matrix column order. NOTE(review): acast() should lay
    # the sentiment columns out alphabetically (negative, positive), which makes
    # "salmon" = negative and "#72bcd4" = positive, the reverse of what the
    # earlier "positive negative" note said. Confirm against the rendered plot.
    colors = c("salmon", "#72bcd4"),
    max.words = 50
  )
### 2020-11-18 正負情緒代表字
# Lexicon word frequencies for 2020-11-18 only, most frequent first.
# Resulting columns: word, sentiment, sum.
sentiment_sum_select <- word_count %>%
  filter(artDate == as.Date("2020-11-18")) %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum)) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

# Horizontal bar chart of that day's 30 most frequent lexicon words, one panel
# per sentiment. The input is already a plain data frame, so no ungroup() step
# is needed before reordering.
sentiment_sum_select %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "2020-11-18 正負面代表字", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
之前的情緒分析大部分是全部的詞彙加總,接下來將正負面情緒的文章分開,看看能不能發現一些新的東西。接下來歸類文章,將每一篇文章正負面情緒的分數算出來,然後大概分類文章屬於正面還是負面。
# Label every article (artUrl) by comparing its positive and negative lexicon
# hits: strictly more positive hits -> "positive", anything else (ties
# included) -> "negative".
article_type <- data_select %>%
  inner_join(LIWC) %>%
  group_by(artUrl, sentiment) %>%
  summarise(count = n()) %>%
  # one column per sentiment; a sentiment an article never hit becomes 0
  spread(sentiment, count, fill = 0) %>%
  mutate(type = if_else(positive > negative, "positive", "negative")) %>%
  data.frame()
## Joining, by = "word"
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 996
## 2 positive 712
接下來對2020-11-18、2020-11-19、2020-12-11~2020-12-13這幾天做正負面文章的統計圖
# Attach the posting date to every classified article.
article_type_date <- left_join(
  article_type[, c("artUrl", "type")],
  MetaData[, c("artUrl", "artDate")],
  by = "artUrl"
)

# Dodged bar chart of positive vs. negative article counts for every date in
# the closed range [from, to]. `to` defaults to `from`, which gives a
# single-day chart. Both arguments are "YYYY-MM-DD" strings (or Dates).
plot_article_types <- function(from, to = from) {
  article_type_date %>%
    group_by(artDate, type) %>%
    summarise(count = n()) %>%
    filter(artDate >= as.Date(from), artDate <= as.Date(to)) %>%
    ggplot(aes(x = artDate, y = count, fill = type)) +
    geom_bar(stat = "identity", position = position_dodge())
}

d1 <- plot_article_types("2020-11-18")
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
d2 <- plot_article_types("2020-11-19")
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
# Previously filtered with as.Date('2020-12-11~2020-12-13'), which parses only
# the leading date and therefore showed 12-11 alone instead of all three days.
d3 <- plot_article_types("2020-12-11", "2020-12-13")
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
接著把正面和負面的文章挑出來,並和斷詞結果合併。
# Tokens of every article labelled "negative"; columns: artUrl, word.
negative_article <- article_type %>%
  filter(type == "negative") %>%
  select(artUrl) %>%
  left_join(data_select[, c("artUrl", "word")], by = "artUrl")

# Tokens of every article labelled "positive"; columns: artUrl, word.
positive_article <- article_type %>%
  filter(type == "positive") %>%
  select(artUrl) %>%
  left_join(data_select[, c("artUrl", "word")], by = "artUrl")
畫出正負面文章情緒貢獻度較高的關鍵字
# Horizontal bar chart of the 30 most frequent lexicon words found in
# `articles` (a table with a `word` column), one panel per sentiment.
#   articles: token table, e.g. negative_article or positive_article
#   y_label:  text for the value axis
# Returns the ggplot object; calling it at top level prints the chart.
plot_sentiment_contribution <- function(articles, y_label) {
  articles %>%
    inner_join(LIWC) %>%
    group_by(word, sentiment) %>%
    summarise(sum = n()) %>%
    arrange(desc(sum)) %>%
    data.frame() %>%
    top_n(30, wt = sum) %>%
    mutate(word = reorder(word, sum)) %>%
    ggplot(aes(word, sum, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = y_label, x = NULL) +
    theme(text = element_text(size = 14)) +
    coord_flip()
}

# Keyword contribution chart for the negative articles
plot_sentiment_contribution(negative_article, "Contribution to negative sentiment")
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# Keyword contribution chart for the positive articles
plot_sentiment_contribution(positive_article, "Contribution to positive sentiment")
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# Bring in the Dcard posts for comparison and tokenize them with the same
# tokenizer used for the PTT data.
Dcard <- fread('D:/W5/dcard_52_articleMetaData.csv', encoding = 'UTF-8')
DToken <- unnest_tokens(Dcard, word, sentence, token = customized_tokenizer)

# Tag each token table with the platform it came from.
PTT_Token <- rbind(
  MToken[, c("artDate", "artUrl", "word")],
  RToken[, c("artDate", "artUrl", "word")]
) %>%
  mutate(source = "ptt")
Dcard_Token <- mutate(DToken, source = "dcard")

# Stack both platforms on their shared columns, then normalise the date column.
data_combine <- rbind(
  PTT_Token,
  Dcard_Token[, c("artDate", "artUrl", "word", "source")]
)
data_combine$artDate <- as.Date(data_combine$artDate, "%Y/%m/%d")
## [1] "2020/01/02" "2020/12/26"
# Daily sentiment counts from 2020-10-01 onwards, PTT and Dcard stacked in
# separate panels. Sentiment is mapped to `fill`, so the legend title is set
# through `fill`. The earlier scale_color_manual() call had no values and no
# colour aesthetic to act on, and labs(color = ...) never reached the legend.
compare_plot <- data_combine %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment, source) %>%
  summarise(count = n()) %>%
  filter(artDate >= as.Date("2020-10-01")) %>%
  ggplot(aes(x = artDate, y = count, fill = sentiment)) +
  geom_col(position = "dodge") +
  scale_x_date(labels = date_format("%m/%d")) +
  labs(title = "ptt & dcard 的情緒分數", fill = "情緒類別") +
  # scales = "free_y" lets each platform's panel use its own y-axis range
  facet_wrap(~source, ncol = 1, scales = "free_y")
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
來看看2020-11-18Dcard大多以什麼樣的字眼評論中天換照事件
# Normalise the Dcard date column (year/month/day input format).
Dcard_Token$artDate <- as.Date(Dcard_Token$artDate, "%Y/%m/%d")

# Same token clean-up as for the PTT data: drop anything containing
# punctuation, ASCII letters or digits, and drop single-character tokens.
data_select_Dcard <- Dcard_Token %>%
  filter(!grepl("[[:punct:]]", word)) %>%
  # The former class ['^0-9a-z'] also listed the literal characters ' and ^
  # (already removed by the punctuation filter) and skipped uppercase letters.
  filter(!grepl("[0-9A-Za-z]", word)) %>%
  filter(nchar(word) > 1)

# Per-day Dcard word frequencies; columns: artDate, word, count.
word_count_dcard <- data_select_Dcard %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>%
  filter(count > 3) %>%  # keep words seen more than 3 times on a given day
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## # A tibble: 527 x 3
## # Groups: artDate [46]
## artDate word count
## <date> <chr> <int>
## 1 2020-11-18 中天 118
## 2 2020-11-18 新聞 68
## 3 2020-11-18 換照 39
## 4 2020-11-19 中天 39
## 5 2020-11-04 中天 33
## 6 2020-11-19 新聞 29
## 7 2020-09-25 新聞 26
## 8 2020-10-25 中天 26
## 9 2020-12-11 中天 26
## 10 2020-11-20 中天 25
## # ... with 517 more rows
# Dcard word cloud for 2020-11-18, without the two dominant topic words.
word_count_dcard %>%
  filter(
    artDate == as.Date("2020-11-18"),
    !(word %in% c("中天","新聞"))
  ) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  wordcloud2()
## Adding missing grouping variables: `artDate`
* 經文字雲可以看到,在2020-11-18,dcard出現頻率較高的文字大多為陳述時會用到的字詞,與其他的分析結果較不同的是出現「耀祥」二字,經搜尋,應是指NCC主委陳耀祥,推測是當天NCC駁回中天換照,才會不斷提到主委。
1.中天關台的訊息在什麼時候討論度最高?
中天關台的訊息在2020-11-18的討論度最高,推測為NCC在當天決議駁回中天新聞台換照申請引起大家關注,而在2020-11-19的討論度也不低,可能是延續前一天的討論熱度。
再來是2020-12-11~2020-12-13這段時間,中天在2020/12/12零點正式下架,推測是因為中天下架而討論度較高。
2.鄉民正面和負面討論的情緒如何?
依分析結果顯示,大部分天數的情緒為好壞參半,較為特別的是2020-11-18,鄉民對中天關台的討論為正面情緒大於負面情緒,以及2020-11-19,負面情緒比正面多。推測是鄉民在收到中天關台的訊息時,第一時間皆是表達正面的字詞,如「中天要關台了好開心!」;而在後面幾天,鄉民有可能加上了比較多負面的字詞來討論,或者是另一派反對的人開始加入討論。
3.鄉民對中天關台的意見是甚麼?
以2020-11-18為例,藉由文字雲的顯示,出現了「三立」、「崩潰」、「台灣」、「開心」、「綠共」等字,可以看到大多數人持正面意見,而關於「自由」的字眼也非常多。較意外的是,「三立」電視台的字眼出現得非常頻繁,推測是有另一派的鄉民希望執政黨不要雙標。而相較於Dcard的使用者,ptt的使用者用字遣詞較為具體,如帶有色彩的「紅媒」、「綠共」、「三民自」…,意識形態較為濃厚。