System locale settings
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""
Install the required packages
# echo = T, results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr", "data.table", "reshape2", "wordcloud", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Load the libraries
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)
Using the 愛莉莎莎 liver-gallbladder stone flush ("肝膽排石") controversy as a case study, we analyze the related discussion among PTT users. The analysis focuses on the following questions:
1. When did discussion of the 愛莉莎莎 liver-gallbladder stone flush controversy emerge, and when did it peak?
2. What do the positive and negative discussions contain, and do they differ over time?
3. Roughly how large are the sentiment scores of the positive and negative discussions?
# read in the articles and their comments
MetaData = fread('../data/ptt_articleMetaData.csv', encoding = 'UTF-8')
Reviews = fread('../data/ptt_articleReviews.csv', encoding = 'UTF-8')
# filter the articles again by keyword
keywords = c('愛莉莎莎','肝膽排石','蒼藍鴿')
toMatch = paste(keywords, collapse = "|")
MetaData = with(MetaData, MetaData[grepl(toMatch, sentence) | grepl(toMatch, artTitle),])
# keep only the comments belonging to the selected articles
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
(1). Tokenizing the articles
Set up the tokenizer
# add the user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user = "../dict/user_dict.txt", stop_word = "../dict/stop_words.txt")
# define the tokenizer function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
# tokenize the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token = customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token = customized_tokenizer)
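A quick sanity check of the tokenizer before moving on (a minimal sketch; the example sentence is made up, and it assumes the dictionary files above exist):
customized_tokenizer("愛莉莎莎發布道歉影片")  # returns a list containing one character vector of tokens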
# combine the article tokens and the comment tokens
data <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])
(2). Basic data cleaning
# format the date column
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")
# filter out special characters
data_select = data %>%
  filter(!grepl('[[:punct:]]', word)) %>%   # drop punctuation
  filter(!grepl("[0-9a-z]", word)) %>%      # drop English letters and digits
  filter(nchar(word) > 1)                   # drop single-character tokens
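A small illustration of what these three filters remove (a sketch with made-up tokens):
toy <- c("醫生", "!!", "abc", "2021", "了")
toy[!grepl('[[:punct:]]', toy) & !grepl("[0-9a-z]", toy) & nchar(toy) > 1]
## expected: "醫生"  (punctuation, English, digits, and single characters are all dropped)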
# count how often each word appears on each day
# word_count: artDate, word, count
word_count <- data_select %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>%   # summarise gives per-day, per-word counts
  filter(count > 5) %>%        # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 3,227 x 3
## # Groups: artDate [125]
## artDate word count
## <date> <chr> <int>
## 1 2021-02-13 醫生 398
## 2 2021-02-13 影片 365
## 3 2021-02-13 莎莎 355
## 4 2021-02-13 愛莉莎莎 352
## 5 2021-02-14 愛莉莎莎 262
## 6 2021-03-01 公務員 211
## 7 2021-02-14 影片 207
## 8 2021-02-13 血流成河 197
## 9 2021-02-14 道歉 196
## 10 2021-02-13 蒼藍鴿 193
## # ... with 3,217 more rows
The LIWC dictionary, whose full name is Linguistic Inquiry and Word Count, was published by the psychologist Pennebaker in 2001; it is divided into positive-emotion and negative-emotion word lists.
Read in the dictionary files; the words are separated by commas.
P <- read_file("../dict/liwc/positive.txt") # positive dictionary (txt)
N <- read_file("../dict/liwc/negative.txt") # negative dictionary (txt)
# each dictionary file is read in as one long string
typeof(P)
## [1] "character"
Split the strings into individual words, then combine the two sentiment dictionaries.
# split each string on ","
# strsplit returns a list; we take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words
# stack the two dictionaries
LIWC = rbind(P, N)
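As a quick check of the merge (a sketch; the expected counts are the ones noted in the comments above):
table(LIWC$sentiment)
## expected: negative 1047, positive 664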
# inspect the dictionary
# (LIWC)
Before plotting sentiment, let's first look at the daily posting volume: substantial discussion only appears after 2/12.
MetaData$artDate = MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count)) +
  scale_x_date(labels = date_format("%m/%d"))
Find the words in the corpus that the LIWC dictionary marks as positive or negative, and compute the daily sentiment totals (sentiment_count).
# sentiment_count: artDate, sentiment, count
sentiment_count = data_select %>%
  select(artDate, word) %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
Plotting the daily sentiment totals shows that within just a few days after 2/12 the dominant sentiment flipped from positive to negative, with discussion peaking on 2/13 and gradually tapering off after about 2/22.
# check the date range of the data; the plots below zoom in on 2021-02-08 through 2021-02-22
range(sentiment_count$artDate)
## [1] "2020-11-01" "2021-03-24"
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-08','2021-02-22'))) +
  # add a vertical line marking 2/12
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-12'))[1]])),
             colour = "red")
## Warning: Removed 256 row(s) containing missing values (geom_path).
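The warning above comes from the scale_x_date limits silently discarding points outside the window. Filtering the data to the window before plotting (a sketch) produces the same picture without the warning:
sentiment_count %>%
  filter(artDate >= as.Date('2021-02-08'), artDate <= as.Date('2021-02-22')) %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"))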
Normalizing the sentiment scores and plotting again shows that negative sentiment holds the larger share after about 2/12, but from 2/19 on the positive and negative shares oscillate.
愛莉莎莎's related videos: 2/12 "不忍了!正式回應蒼藍鴿對我的各種指責", and the 2/14 apology video.
蒼藍鴿's related videos: 1/1 "喝橄欖油排膽結石?「肝膽排石法」騙局破解!", and the 2/21 formal reply to 愛莉莎莎 ("肝膽排石法").
sentiment_count %>%
  # normalization: each day's counts become shares that sum to 1
  group_by(artDate) %>%
  mutate(ratio = count / sum(count)) %>%
  # plotting
  ggplot() +
  geom_line(aes(x = artDate, y = ratio, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-08','2021-02-22'))) +
  # add a vertical line marking 2/12
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-12'))[1]])),
             colour = "red") +
  # add a vertical line marking 2/19
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-19'))[1]])),
             colour = "red")
## Warning: Removed 256 row(s) containing missing values (geom_path).
Since 蒼藍鴿 released his video replying to 愛莉莎莎 on 2/21, we infer that the articles around that date lean more positive.
# rank the days by total sentiment score
sentiment_count %>%
  select(count, artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 140 x 2
## artDate sum
## <date> <int>
## 1 2021-02-13 2790
## 2 2021-02-14 1581
## 3 2021-02-15 692
## 4 2021-03-01 532
## 5 2021-02-17 514
## 6 2021-02-12 488
## 7 2021-02-16 444
## 8 2021-02-19 314
## 9 2021-03-02 297
## 10 2021-01-04 180
## # ... with 130 more rows
Starting from the sentiment peak on 2021-02-13, and consistent with the negative sentiment trend above, words like 「可憐」 ("pathetic") and 「打臉」 ("slapped down") appear. We suspect this is because many netizens felt people should follow the advice of physicians rather than non-medical professionals, and wanted 愛莉莎莎 to apologize.
# draw the word cloud for 2/13
word_count %>%
  filter(!(word %in% c("愛莉莎莎","莎莎"))) %>%   # drop the subject's own name
  filter(artDate == as.Date('2021-02-13')) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 50) %>%   # drop words that appear too rarely
  wordcloud2()
## Adding missing grouping variables: `artDate`
On 2021-02-14, 愛莉莎莎's apology video was released. Consistent with the negative sentiment trend above, many people remained dissatisfied, and words like 「死不認錯」 ("refuses to admit fault") and 「台女」 even appear. We suspect this is because many netizens felt the apology's attitude was poor; the discussion also touched on 愛莉莎莎's educational background (清大) and the video's view counts.
# build the word-frequency table for the 2/14 word cloud
plot_0214 =
  word_count %>%
  filter(!(word %in% c("愛莉莎莎","莎莎"))) %>%
  filter(artDate == as.Date('2021-02-14')) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 40)   # drop words that appear too rarely
## Adding missing grouping variables: `artDate`
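Unlike the 2/13 chunk, this one only stores the frequency table; to actually render the cloud, pass it to wordcloud2 (a minimal sketch):
wordcloud2(plot_0214)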
Compute the total frequency of every word (sentiment_sum) to find the words most representative of each sentiment.
# sentiment_sum: word, sentiment, sum
sentiment_sum <-
  word_count %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum)) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # negative, positive (columns are in alphabetical order)
    max.words = 50)
We can also observe how the representative sentiment words change on specific dates.
# representative sentiment words on 2/13 only
sentiment_sum_select <-
  word_count %>%
  filter(artDate == as.Date('2021-02-13')) %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum)) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment 0213", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
sentiment_sum_select %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # negative, positive (columns are in alphabetical order)
    max.words = 50)
# classify each article by the balance of its positive and negative words
article_type =
  data_select %>%
  inner_join(LIWC) %>%
  group_by(artUrl, sentiment) %>%
  summarise(count = n()) %>%
  spread(sentiment, count, fill = 0) %>%   # spread positive/negative into columns, filling gaps with 0
  mutate(type = case_when(positive > negative ~ "positive",
                          TRUE ~ "negative")) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# count how many articles fall in each class
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 722
## 2 positive 485
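As a quick follow-up (a sketch), the share of negative articles is 722 / (722 + 485) ≈ 60%:
article_type %>% summarise(negative_ratio = mean(type == "negative"))
## expected: ~0.598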
# attach each article's date to its sentiment class
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")
article_type_date %>%
  group_by(artDate, type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-12','2021-02-22')))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 246 rows containing missing values (geom_bar).
Pick out the positive and negative articles and merge each set with the tokenization results.
# negative_article: artUrl, word
negative_article <-
  article_type %>%
  filter(type == "negative") %>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# positive_article: artUrl, word
positive_article <-
  article_type %>%
  filter(type == "positive") %>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")
Plot the keywords that contribute the most sentiment within the negative and positive articles.
# keyword contribution plot for the negative articles
negative_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# keyword contribution plot for the positive articles
positive_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = n()) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment", x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
Comparing the two contribution plots, the keyword sets do not differ dramatically. Articles classified as negative more often contain words such as 「可憐」, 「問題」, and 「噁心」 that discuss 愛莉莎莎 negatively, while articles classified as positive contain more words like 「相信」 ("trust"), 「可愛」, and 「喜歡」. Note, however, that 「相信」 may refer to trusting the doctors rather than trusting 愛莉莎莎.
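One way to check that caveat is to spot-check the sentences containing 「相信」 (a sketch, assuming the original sentence column of MetaData is still available):
MetaData %>%
  filter(grepl("相信", sentence)) %>%
  pull(sentence) %>%
  head(5)   # read a few examples to see who or what is being trusted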