System settings
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""
Install the required packages
# echo = T, results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", 'readr', 'data.table', 'reshape2', 'wordcloud', 'tidyr', 'scales')
existing = as.character(installed.packages()[,1])
for (pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Load the libraries
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
require(jiebaR)
This analysis looks at the recent "Salmon Chaos" (鮭魚之亂) incident, focusing on the related discussion among netizens on PTT and comparing it with the discussion on Dcard. We address three questions:
1. When did the Salmon Chaos discussion appear, and when did the topic peak? 2. What do the positive and negative discussions talk about, and do they differ over time? 3. Roughly how large are the sentiment scores of the positive and negative discussions?
Some PTT articles contain the keyword 改名 (name change) but are unrelated to the topic, so we filter articles to those containing 鮭 (salmon) or the other sushi-related keywords.
# read in the articles and comments
MetaData = fread('../data/ptt_articleMetaData.csv', encoding = 'UTF-8')
Reviews = fread('../data/ptt_articleReviews.csv', encoding = 'UTF-8')
# filter the articles again: 826 articles remain
keywords = c('鮭','免費壽司','免費吃壽司','免費的壽司','壽司郎')
toMatch = paste(keywords, collapse = "|")
MetaData = with(MetaData, MetaData[grepl(toMatch, sentence) | grepl(toMatch, artTitle),])
# select the comments belonging to the filtered articles
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
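As a quick sanity check (a small sketch, assuming the objects built above), confirm the filter kept 826 articles and count the comments that joined:
# sanity check on the filtered data
nrow(MetaData)                  # expected: 826 articles
sum(!is.na(Reviews$cmtContent)) # total number of comments joined to those articles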
(1). Tokenizing the articles
Set up the tokenizer engine
# add user-defined and stop-word dictionaries
jieba_tokenizer <- worker(user = "../dict/user_dict.txt", stop_word = "../dict/stop_words.txt")
# 設定斷詞function
<- function(t) {
customized_tokenizer lapply(t, function(x) {
<- segment(x, jieba_tokenizer)
tokens return(tokens)
}) }
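To see what the tokenizer returns, run it on a sample sentence (illustrative only: the actual tokens depend on user_dict.txt and stop_words.txt):
# illustrative: output varies with the custom and stop-word dictionaries
customized_tokenizer("為了免費壽司去改名")
# e.g. a list with tokens such as "免費" "壽司" "改名"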
# tokenize the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token = customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token = customized_tokenizer)

# combine the two token tables
data <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])
(2). Basic data cleaning
# format the date column
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")
# filter out special characters
data_select = data %>%
  filter(!grepl('[[:punct:]]', word)) %>% # remove punctuation
  filter(!grepl("['^0-9a-z']", word)) %>% # remove English letters and digits
  filter(nchar(.$word) > 1)
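A quick illustration of what each filter removes, on a made-up token vector:
# hypothetical tokens, only to show the effect of each filter
tokens <- c("鮭魚", "!!", "abc123", "好")
tokens[!grepl('[[:punct:]]', tokens)] # drops "!!"
tokens[!grepl("['^0-9a-z']", tokens)] # drops "abc123" (contains digits/lowercase letters)
tokens[nchar(tokens) > 1]             # drops the single character "好"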
# compute each word's frequency per day
# word_count: artDate, word, count
word_count <- data_select %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>% # summarise() with n() counts the rows in each group
  filter(count > 3) %>%      # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 9,074 x 3
## # Groups: artDate [12]
## artDate word count
## <date> <chr> <int>
## 1 2021-03-19 鮭魚 1847
## 2 2021-03-17 鮭魚 1822
## 3 2021-03-17 改名 1526
## 4 2021-03-19 改名 1489
## 5 2021-03-18 鮭魚 1260
## 6 2021-03-18 改名 1072
## 7 2021-03-19 台灣 806
## 8 2021-03-19 真的 772
## 9 2021-03-20 鮭魚 564
## 10 2021-03-17 免費 496
## # ... with 9,064 more rows
LIWC, in full Linguistic Inquiry and Word Count, is a dictionary published by psychologist Pennebaker in 2001. It divides words into positive and negative emotion categories.
Read the files; within each file the words are separated by commas.
P <- read_file("../dict/liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("../dict/liwc/negative.txt") # negative-word dictionary (txt)
# each dictionary txt file is read in as one long string
typeof(P)
## [1] "character"
Split the strings into words and combine the two sentiment dictionaries.
# split each string on ","
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words

# combine the two dictionaries
LIWC = rbind(P, N)
# inspect the dictionary
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
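A one-line check that the combined dictionary matches the sizes noted in the comments above (664 positive and 1047 negative words):
# confirm dictionary sizes
LIWC %>% count(sentiment)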
Before plotting sentiment, look at the daily posting volume; substantial discussion only appears after about 3/15.
MetaData$artDate = MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count)) +
  scale_x_date(labels = date_format("%m/%d"))
Find the words in the corpus that the LIWC dictionary labels positive or negative,
and compute the daily sentiment totals (sentiment_count).
# sentiment_count: artDate, sentiment, count
sentiment_count = data_select %>%
  select(artDate, word) %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = n()) # n() counts rows; sum() would total a column's values
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
Plot the daily sentiment totals. Within just a few days around 3/18, the dominant sentiment flips from positive to negative, and the discussion volume gradually declines after the 20th.
# check the date range of the data
range(sentiment_count$artDate) # "2021-03-03" "2021-03-21"
## [1] "2021-03-03" "2021-03-21"
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-10','2021-03-21'))
               ) +
  # add a vertical line marking 3/18
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])), colour = "red")
## Warning: Removed 3 row(s) containing missing values (geom_path).
Normalize the sentiment scores and plot again. Although positive and negative sentiment fluctuate, they mostly split roughly evenly; after about 3/18 the negative share is higher.
sentiment_count %>%
  # normalization
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plotting
  ggplot() +
  geom_line(aes(x = artDate, y = ratio, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-10','2021-03-21'))
               ) +
  # add a vertical line marking 3/18
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])), colour = "red")
## Warning: Removed 3 row(s) containing missing values (geom_path).
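As a minimal illustration of the per-day normalization (the counts here are made up): a day with 30 positive and 20 negative words yields ratios of 0.6 and 0.4.
# hypothetical counts, only to illustrate mutate(ratio = count/sum(count))
toy <- data.frame(artDate = as.Date('2021-03-18'),
                  sentiment = c("positive", "negative"),
                  count = c(30, 20))
toy %>%
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) # 0.6 and 0.4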
We pick out a few dates with sentiment peaks. Looking at the daily sentiment scores, the topic starts to be heavily discussed around the 16th, peaks on the 19th, and slowly declines afterwards.
# rank the days by total sentiment score
sentiment_count %>%
  select(count, artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 12 x 2
## artDate sum
## <date> <int>
## 1 2021-03-19 7156
## 2 2021-03-17 4744
## 3 2021-03-18 3684
## 4 2021-03-20 1868
## 5 2021-03-21 1463
## 6 2021-03-16 1208
## 7 2021-03-15 21
## 8 2021-03-12 13
## 9 2021-03-08 6
## 10 2021-03-10 4
## 11 2021-03-03 2
## 12 2021-03-06 1
Pick the dates of interest and draw word clouds to see what was being discussed.
Start with the sentiment peak on 2021-03-19. Echoing the negative sentiment analysis above, words such as 浪費 (waste), 貪小便宜 (taking petty advantage), 丟臉 (embarrassing), and 乞丐 (beggar) appear. Presumably many netizens attacked the name changes as unwise and criticized some of the free-sushi diners for wasting food.
# draw the word cloud
word_count %>%
  filter(!(word %in% c("鮭魚"))) %>%
  filter(artDate == as.Date('2021-03-19')) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 100) %>% # drop words that appear too rarely
  wordcloud2()
## Adding missing grouping variables: `artDate`
Look at the discussion on the surrounding days.
The 2021-03-17 word cloud: on the 17th, when positive sentiment was higher, there are fewer critical words and more positive or neutral ones such as 好笑 (funny), 年輕人 (young people), 行銷 (marketing), and 廣告 (advertising). Presumably there were fewer negative articles at this point; netizens still took a rather humorous attitude toward the salmon incident, finding the campaign amusing or the young people impulsive.
# draw the word cloud
plot_0317 = word_count %>%
  filter(!(word %in% c("鮭魚"))) %>%
  filter(artDate == as.Date('2021-03-17')) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 120) %>% # drop words that appear too rarely
  wordcloud2()
## Adding missing grouping variables: `artDate`
plot_0317
The 2021-03-18 word cloud: the 18th is the turning point from positive to negative sentiment, and more negative words appear, such as 浪費 (waste), 可憐 (pathetic), and 貪小便宜 (taking petty advantage).
# draw the word cloud
plot_0318 = word_count %>%
  filter(!(word %in% c("鮭魚"))) %>%
  filter(artDate == as.Date('2021-03-18')) %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 100) %>% # drop words that appear too rarely
  wordcloud2()
plot_0318
Compute the total frequency of every word (sentiment_sum) and find the words most representative of each sentiment.
# sentiment_sum: word, sentiment, sum
sentiment_sum <-
  word_count %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>%
  arrange(desc(sum)) %>%
  data.frame() # store as a data frame
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% # reorder words by frequency
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
An alternative way to present this
sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("purple", "#72bcd4"), # negative, positive
    max.words = 50)
We can also look at how the representative sentiment words vary across dates.
sentiment_sum_select <-
  word_count %>%
  filter(artDate == as.Date('2021-03-17')) %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>%
  arrange(desc(sum)) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment 0317",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
sentiment_sum_select %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # negative, positive
    max.words = 50)
So far the sentiment analysis has aggregated all words together. Next we separate the positive and negative articles to see whether anything new emerges. We classify each article by computing its positive and negative sentiment scores and roughly labeling it positive or negative.
# classify articles by the balance of positive vs negative sentiment
article_type =
  data_select %>%
  inner_join(LIWC) %>%
  group_by(artUrl, sentiment) %>%
  summarise(count = n()) %>%
  spread(sentiment, count, fill = 0) %>% # spread the sentiments into columns, filling missing values with 0
  mutate(type = case_when(positive > negative ~ "positive",
                          TRUE ~ "negative")) %>%
  data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# how many articles of each type
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 435
## 2 positive 513
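Expressed as shares (a one-liner on the counts above): roughly 46% of the articles are negative and 54% positive.
# proportion of each article type
article_type %>%
  group_by(type) %>%
  summarise(count = n()) %>%
  mutate(prop = count/sum(count)) # ~0.46 negative, ~0.54 positive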
After about 3/19, negative articles increase the most.
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")

article_type_date %>%
  group_by(artDate, type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-10','2021-03-21'))
               )
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 5 rows containing missing values (geom_bar).
Pick out the positive and negative articles and merge them with the tokenization results.
# negative_article: artUrl, word
negative_article <-
  article_type %>%
  filter(type == "negative") %>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

# positive_article: artUrl, word
positive_article <-
  article_type %>%
  filter(type == "positive") %>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")
Plot the keywords that contribute most to sentiment in the positive and negative articles.
# contribution of keywords in negative articles
a <- negative_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = n()
  ) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum))
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
negative_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = n()
  ) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# contribution of keywords in positive articles
positive_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = n()
  ) %>%
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
Comparing the two contribution plots, the positive and negative keywords do not differ dramatically. Articles with more negative sentiment more often contain words about the downsides of changing one's name, such as 擔心 (worried), 後悔 (regret), and 麻煩 (trouble), focusing on criticizing the name-change behavior itself; articles with more positive sentiment contain more words like 免費 (free), 優惠 (deal), and 好玩 (fun), focusing on the promotion itself.
# bring in the Dcard data for comparison
Dcard = fread('../data/dcard_articleMetaData.csv', encoding = 'UTF-8')
DToken <- Dcard %>% unnest_tokens(word, sentence, token = customized_tokenizer)
PTT_Token <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])

PTT_Token = PTT_Token %>% mutate(source = "ptt")
Dcard_Token = DToken %>% mutate(source = "dcard")

# combine the two sources
data_combine = rbind(PTT_Token, Dcard_Token[,c("artDate","artUrl", "word","source")])

data_combine$artDate = data_combine$artDate %>% as.Date("%Y/%m/%d") # normalize the date format
Histograms of sentiment on PTT and Dcard: Dcard shows slightly more positive sentiment than PTT, and the discussion peaks at roughly the same time.
range(Dcard$artDate) #"2021/03/15" "2021/03/21"
## [1] "2021/03/15" "2021/03/21"
data_combine %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment, source) %>%
  summarise(count = n()) %>%
  filter(artDate >= '2021-03-15') %>%
  # plotting
  ggplot(aes(x = artDate, y = count, fill = sentiment)) +
  scale_color_manual() +
  geom_col(position = "dodge") +
  scale_x_date(labels = date_format("%m/%d")) +
  labs(title = "sentiment of ptt & dcard", color = "sentiment type") +
  facet_wrap(~source, ncol = 1, scales = "free_y") # free_y gives each panel its own y scale
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.
Finally, let's answer the questions posed at the start:
1. When did the Salmon Chaos discussion appear, and when did the topic peak?
Heated discussion starts around 3/15; the topic peaks on 3/19 and the heat gradually declines afterwards.
2. What do the positive and negative discussions talk about, and do they differ over time?
Before 3/18 there are some negative comments, but judging by the sentiment scores the discussion stays mostly positive: netizens keep a rather humorous attitude and mostly discuss the promotion itself, with word-cloud keywords such as 好玩 (fun), 優惠 (deal), and 不錯 (not bad). After 3/18, negative comments increase sharply; some criticize the young people for changing their names rashly, predicting they will 後悔 (regret) it, and call them 貪小便宜 (petty advantage-takers) and 丟臉 (embarrassing).
3. Which sentiment score is higher, positive or negative?
Positive sentiment is higher before 3/18; after 3/18 negative sentiment gradually increases, but on PTT the two stay roughly even.
Exercises: 1. Compute Dcard users' positive and negative sentiment scores and present them as lines in a single chart. 2. Draw word clouds of Dcard users' representative positive and negative words, and compare how Dcard and PTT comment on the Salmon Chaos.
### Positive and negative sentiment scores of Dcard users ###
# Step 1: Chinese tokenization, producing DToken
# Step 2: compute word frequencies
# 2.1 format the date column
DToken$artDate = DToken$artDate %>% as.Date("%Y/%m/%d")
# 2.2 filter special characters
D_data = DToken %>%
  filter(!grepl('[[:punct:]]', word)) %>% # remove punctuation
  filter(!grepl("['^0-9a-z']", word)) %>% # remove English letters and digits
  filter(nchar(.$word) > 1)
# 2.3 compute daily word frequencies
DToken_word_count <- D_data %>%
  select(artDate, word) %>%
  group_by(artDate, word) %>%
  summarise(count = n()) %>%
  filter(count > 3) %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
# Step 3: inner join with the LIWC sentiment dictionary
# group by word and sentiment type
D_sentiment_type_count_By_Word = DToken_word_count %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum)) %>%
  data.frame() # store as a data frame
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
# comparison bar chart
D_sentiment_type_count_By_Word %>%
  top_n(30, wt = sum) %>%
  mutate(word = reorder(word, sum)) %>% # reorder words by frequency
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
# comparison word cloud, negative vs positive
D_sentiment_type_count_By_Word %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # negative, positive
    max.words = 50)
# group by date and sentiment type
D_sentiment_type_count_By_Date = DToken_word_count %>%
  select(artDate, word, count) %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count_sentiment_typeByDate = n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
# sentiment line chart, 3/18-3/21; Dcard has far fewer articles than PTT
D_sentiment_type_count_By_Date %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count_sentiment_typeByDate, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-18','2021-03-21'))
               ) +
  # add a vertical line marking 3/18
  geom_vline(aes(xintercept = as.numeric(artDate[which(D_sentiment_type_count_By_Date$artDate == as.Date('2021-03-18'))[1]])), colour = "red")
# Step 4: spread the sentiment column; this yields the daily sentiment scores
D_article_sentiment_By_Date = D_sentiment_type_count_By_Date %>%
  spread(sentiment, count_sentiment_typeByDate, fill = 0) %>%
  mutate(type = case_when(positive > negative ~ "positive", TRUE ~ "negative")) %>%
  mutate(s_score = positive - negative)
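With the s_score column in place, Dcard's daily net sentiment can be drawn as a single line (a sketch using the objects built above; the styling is our own choice):
# sketch: daily net sentiment score (positive - negative) on Dcard
D_article_sentiment_By_Date %>%
  ggplot(aes(x = artDate, y = s_score)) +
  geom_line() +
  scale_x_date(labels = date_format("%m/%d"))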
Working as a study group, analyze data on a topic you are interested in, publish the assignment as an RPubs page, and upload the link to 網大 under「第五週HW」(Week 5 HW); one upload per group is enough.