System parameter setup
Install the required packages
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)

# read in the articles and comments
MetaData = fread('../data/PTT_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('../data/PTT_articleReviews.csv',encoding = 'UTF-8')
stockMetaData = fread('../data/PTT_stock_articleMetaData.csv',encoding = 'UTF-8')
stockReviews = fread('../data/PTT_stock_articleReviews.csv',encoding = 'UTF-8')
MetaData = rbind(MetaData, stockMetaData)
Reviews = rbind(Reviews, stockReviews)
# match each article with its comments
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

(1). Tokenizing the articles
Set up the tokenization engine
# load the user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user="../dict/user_dict.txt", stop_word = "../dict/stop_words.txt")
# define the tokenizer function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
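As a quick sanity check, the tokenizer can be called on a single sentence; the sample sentence and the output shown here are illustrative only.

# illustrative: tokenize one sample sentence
customized_tokenizer("長賜輪擱淺在蘇伊士運河")
# might return something like: list(c("長賜輪", "擱淺", "蘇伊士運河"))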
# combine the tokenization results of the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
# bind the two token tables together
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])

(2). Basic data cleaning
# parse the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
# filter out special characters
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # drop punctuation
filter(!grepl('[0-9a-zA-Z]',word)) %>% # drop English letters and digits
filter(nchar(word)>1) # drop single-character tokens
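For example, these filters remove tokens such as "ever", "123" or "!", along with any single-character token, while keeping multi-character Chinese terms such as 長榮.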
# count the daily frequency of each term
# word_count: artDate, word, count
word_count <- data_select %>%
select(artDate,word) %>%
group_by(artDate,word) %>%
summarise(count=n()) %>% # per-day count of each term
filter(count>3) %>% # drop terms that appear too rarely
arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## # A tibble: 2,103 x 3
## # Groups: artDate [4]
## artDate word count
## <date> <chr> <int>
## 1 2021-03-25 長榮 432
## 2 2021-03-24 長榮 311
## 3 2021-03-25 運河 271
## 4 2021-03-25 台灣 240
## 5 2021-03-25 塞子 216
## 6 2021-03-25 蘇伊士運河 175
## 7 2021-03-25 日本 140
## 8 2021-03-24 運河 136
## 9 2021-03-25 真的 133
## 10 2021-03-25 埃及 129
## # ... with 2,093 more rows
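The `summarise()` message above is purely informational; with dplyr >= 1.0 it can be silenced by setting the .groups argument explicitly, e.g.:

word_count <- data_select %>%
group_by(artDate, word) %>%
summarise(count = n(), .groups = "drop") # "drop" ungroups the result and silences the message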
LIWC (Linguistic Inquiry and Word Count) is a sentiment dictionary published by the psychologist Pennebaker in 2001; it divides words into positive and negative emotion categories.
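For reference, each dictionary file is just one comma-separated list of terms; judging from the head(LIWC) output below, positive.txt starts roughly like this:

一流,下定決心,不拘小節,不費力,不錯,主動,...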
Read the dictionary files; the terms are separated by commas.
P <- read_file("../dict/liwc/positive.txt") # positive dictionary (txt)
N <- read_file("../dict/liwc/negative.txt") # negative dictionary (txt)
# each dictionary file is read in as one long string
typeof(P)
## [1] "character"
Split the strings into individual terms and merge the two sentiment dictionaries.
# split each string on commas
# strsplit returns a list; take the first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build a data frame with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive") # 664 terms
N = data.frame(word = N, sentiment = "negative") # 1047 terms
# combine the two dictionaries
LIWC = rbind(P, N)
# inspect the dictionary
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
Before plotting sentiment, look at the daily posting volume: discussion peaks around March 25.
MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
scale_x_date(labels = date_format("%m/%d"))

Find the words in the corpus that the LIWC dictionary labels positive or negative.
Compute the daily sentiment totals (sentiment_count). On the 23rd and 26th positive words slightly outnumber negative ones; on the other days negative words dominate.
# sentiment_count:artDate,sentiment,count
sentiment_count = data_select %>%
select(artDate,word) %>%
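# keep only the tokens that appear in the LIWC dictionary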
inner_join(LIWC,by = "word") %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## # A tibble: 8 x 3
## # Groups: artDate [4]
## artDate sentiment count
## <date> <chr> <int>
## 1 2021-03-23 negative 92
## 2 2021-03-23 positive 98
## 3 2021-03-24 negative 307
## 4 2021-03-24 positive 271
## 5 2021-03-25 negative 586
## 6 2021-03-25 positive 453
## 7 2021-03-26 negative 72
## 8 2021-03-26 positive 76
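The output line below lists the days on which positive words outnumber negative ones; it can be reproduced with a sketch like the following (assuming the sentiment_count table above; spread() comes from tidyr):

sentiment_count %>%
spread(sentiment, count) %>% # one column per sentiment
filter(positive > negative) %>% # keep the net-positive days
pull(artDate)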
## [1] "2021-03-23" "2021-03-26"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-22','2021-03-26'))
)+
# add a vertical line marking the peak date
geom_vline(xintercept = as.numeric(as.Date('2021-03-25')), colour = "red")

Normalizing the sentiment counts and plotting again shows that, although both sentiments fluctuate, the negative share peaks around 3/25.
sentiment_count %>%
# normalization: share of each sentiment within the day
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# plotting
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-22','2021-03-26'))
)+
# add a vertical line marking the peak date
geom_vline(xintercept = as.numeric(as.Date('2021-03-25')), colour = "red")

We then pick out the dates with sentiment peaks and rank the daily sentiment totals; the topic peaks on the 25th.
# rank the days by total sentiment-word count
sentiment_count %>%
select(count,artDate) %>%
group_by(artDate) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
## # A tibble: 4 x 2
## artDate sum
## <date> <int>
## 1 2021-03-25 1039
## 2 2021-03-24 578
## 3 2021-03-23 190
## 4 2021-03-26 148
Pick the dates of interest and draw word clouds to see what topics were being discussed.
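The three tables below come from joining word_count with LIWC and filtering by date; a sketch of that step for the 25th might look like:

word_count %>%
filter(artDate == as.Date('2021-03-25')) %>%
inner_join(LIWC)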
## Joining, by = "word"
## # A tibble: 69 x 4
## # Groups: artDate [1]
## artDate word count sentiment
## <date> <chr> <int> <chr>
## 1 2021-03-25 問題 57 negative
## 2 2021-03-25 損失 52 negative
## 3 2021-03-25 重要 33 positive
## 4 2021-03-25 嚴重 30 negative
## 5 2021-03-25 幫忙 29 positive
## 6 2021-03-25 不爽 26 negative
## 7 2021-03-25 解決 26 positive
## 8 2021-03-25 謝謝 25 positive
## 9 2021-03-25 喜歡 21 positive
## 10 2021-03-25 可憐 15 negative
## # ... with 59 more rows
# draw the word cloud
word_count %>%
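# drop a few dominant topic words so that other terms remain visible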
filter(!(word %in% c("台灣","長榮","運河","真的"))) %>%
filter(artDate == as.Date('2021-03-25')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
filter(count>20) %>% # drop low-frequency terms
wordcloud2()
## Adding missing grouping variables: `artDate`
## Joining, by = "word"
## # A tibble: 40 x 4
## # Groups: artDate [1]
## artDate word count sentiment
## <date> <chr> <int> <chr>
## 1 2021-03-24 問題 42 negative
## 2 2021-03-24 解決 20 positive
## 3 2021-03-24 協助 16 positive
## 4 2021-03-24 幫忙 13 positive
## 5 2021-03-24 簡單 12 positive
## 6 2021-03-24 重要 11 positive
## 7 2021-03-24 謝謝 11 positive
## 8 2021-03-24 丟臉 10 negative
## 9 2021-03-24 損失 10 negative
## 10 2021-03-24 可憐 9 negative
## # ... with 30 more rows
## Joining, by = "word"
## # A tibble: 12 x 4
## # Groups: artDate [1]
## artDate word count sentiment
## <date> <chr> <int> <chr>
## 1 2021-03-23 謝謝 20 positive
## 2 2021-03-23 問題 15 negative
## 3 2021-03-23 成功 6 positive
## 4 2021-03-23 故障 6 negative
## 5 2021-03-23 喜歡 6 positive
## 6 2021-03-23 垃圾 5 negative
## 7 2021-03-23 嚴重 5 negative
## 8 2021-03-23 完美 4 positive
## 9 2021-03-23 感謝 4 positive
## 10 2021-03-23 解決 4 positive
## 11 2021-03-23 幫忙 4 positive
## 12 2021-03-23 簡單 4 positive
Sum the frequencies of all sentiment words (sentiment_sum) to find the representative terms: the top negative words include 問題, 損失 and 嚴重, while the top positive words include 解決, 幫忙 and others.
# sentiment_sum:word,sentiment,sum
sentiment_sum <-
word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()

An alternative way to present the same information:
sentiment_sum %>%
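# pivot to a word-by-sentiment matrix; missing combinations become 0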
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"), # columns are alphabetical: salmon = negative, blue = positive
max.words = 100)

We can also look at how the representative sentiment words change from day to day.
sentiment_sum_select <-
word_count %>%
filter(artDate == as.Date('2021-03-25')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment 0325",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()

So far the sentiment analysis has summed over all words. Next we separate positive and negative articles to see whether anything new emerges: for each article we compute its positive and negative scores and roughly classify the article as positive or negative. The result is that negative articles form the larger share.
# classify each article by whether positive or negative words dominate
article_type =
data_select %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
spread(sentiment,count,fill = 0) %>% # one column per sentiment; missing values become 0
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
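# note: ties (positive == negative) fall through to "negative"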
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
## # A tibble: 2 x 2
## type count
## * <chr> <int>
## 1 negative 100
## 2 positive 70
# attach the article dates to the classification results
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")
article_type_date %>%
group_by(artDate,type) %>%
summarise(count = n()) %>%
ggplot(aes(x = artDate, y = count, fill = type)) +
geom_bar(stat = "identity", position = "dodge")+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-03-22','2021-03-26'))
)
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 1 rows containing missing values (geom_bar).
Pick out the positive and negative articles and join them back to the tokenized words.
# negative_article:artUrl,word
negative_article <-
article_type %>%
filter(type=="negative")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# positive_article:artUrl,word
positive_article <-
article_type %>%
filter(type=="positive")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Plot the keywords that contribute the most sentiment within the positive and negative articles.
# keyword contribution plot for negative articles
negative_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to negative sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# keyword contribution plot for positive articles
positive_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to positive sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
Combining data from 聯合新聞網 (United Daily News), we count the daily positive and negative words on each platform. Negative sentiment dominates on both, and the gap is especially clear on March 25.
news = fread('../data/news_articleMetaData.csv',encoding = 'UTF-8')
newsToken <- news %>% unnest_tokens(word, sentence, token=customized_tokenizer)
PToken = Reviews %>% unnest_tokens(word,sentence,token=customized_tokenizer)
PTT_Token = PToken %>% mutate(source = "ptt")
news_Token = newsToken %>% mutate(source = "news")
data_combine = rbind(PTT_Token[,c("artDate","artUrl","word","source")],news_Token[,c("artDate","artUrl", "word","source")])
data_combine$artDate= data_combine$artDate %>% as.Date("%Y/%m/%d")
range(news$artDate)
## [1] "2021/03/24" "2021/03/26"
data_combine %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment,source) %>%
summarise(count = n()) %>%
filter(artDate>='2021-03-23') %>%
# plotting
ggplot(aes(x= artDate,y=count,fill=sentiment)) +
geom_col(position="dodge") +
scale_x_date(labels = date_format("%m/%d")) +
labs(title = "sentiment of PTT & 聯合新聞網", fill = "sentiment category") +
facet_wrap(~source, ncol = 1, scales="free_y") # "free_y" lets each panel use its own y-axis
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
Evergreen Marine's container ship Ever Given (長賜輪) ran aground and blocked the Suez Canal on the morning of the 23rd, severely disrupting global shipping schedules. This major shipping accident, and the oddities surrounding it, drew heavy coverage from international media. We crawled the PTT Gossiping board for posts about the Evergreen incident: discussion begins on the 23rd and is most intense on the 25th. The word cloud for the 23rd contains terms such as 塞子 ("plug") and 世界之光 ("light of the world"), echoing Han Kuo-yu's "plug" remark and netizens' quip that this was "the first time Taiwan has affected the whole world". As the incident gained traction and made international headlines, it unexpectedly made Taiwan famous, which is why the sentiment at the start of the event leans positive. The word cloud for the 25th reflects the blockage itself: Evergreen might later face fines, and the questions of liability and compensation, along with the fact that the Ever Given belongs to Japan's Shoei Kisen Kaisha (正榮汽船), were widely discussed.
The article-level counts show that negative articles form the larger share. The main negative keywords are 問題 ("problem"), 損失 ("loss") and 嚴重 ("serious"); the main positive keywords are 解決 ("solve"), 重要 ("important") and 幫忙 ("help"), consistent with the topics seen in the word clouds above.
We also compared the daily positive and negative sentiment of PTT and 聯合新聞網 articles; the chart shows that the 25th is both the day with the most discussion and the turning point in sentiment.