概要:

分析新疆棉事件在PTT八卦版上面的情緒分佈狀況並做出解釋

前置作業:

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] ""
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(htmlwidgets)
library(webshot)
require(jiebaR)

載入檔案並再檢查一次:

MetaData = fread('PTT_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('PTT_articleReviews.csv',encoding = 'UTF-8')

keywords = c('新疆','棉','維吾爾人','BCI','H&M')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])

# 挑選文章對應的留言
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

設定斷詞:

jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")

customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}


MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)

# 把資料併在一起
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 

data
##            artDate                                                   artUrl
##      1: 2021/03/05 https://www.ptt.cc/bbs/Gossiping/M.1614921934.A.3C8.html
##      2: 2021/03/05 https://www.ptt.cc/bbs/Gossiping/M.1614921934.A.3C8.html
##      3: 2021/03/05 https://www.ptt.cc/bbs/Gossiping/M.1614921934.A.3C8.html
##      4: 2021/03/05 https://www.ptt.cc/bbs/Gossiping/M.1614921934.A.3C8.html
##      5: 2021/03/05 https://www.ptt.cc/bbs/Gossiping/M.1614921934.A.3C8.html
##     ---                                                                    
## 148714: 2021/03/26 https://www.ptt.cc/bbs/Gossiping/M.1616775266.A.6DD.html
## 148715: 2021/03/26 https://www.ptt.cc/bbs/Gossiping/M.1616775266.A.6DD.html
## 148716: 2021/03/26 https://www.ptt.cc/bbs/Gossiping/M.1616775266.A.6DD.html
## 148717: 2021/03/26 https://www.ptt.cc/bbs/Gossiping/M.1616775266.A.6DD.html
## 148718: 2021/03/26 https://www.ptt.cc/bbs/Gossiping/M.1616775266.A.6DD.html
##         word
##      1: 現在
##      2: 只要
##      3: 有人
##      4: 聲援
##      5: 新疆
##     ---     
## 148714: 一樣
## 148715:   嗎
## 148716:    5
## 148717:   毛
## 148718: 還裝

資料清理:

# 格式化日期欄位
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")

# 過濾特殊字元
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1) 

# 算每天不同字的詞頻
# word_count:artDate,word,count
word_count <- data_select %>%
  select(artDate,word) %>%
  group_by(artDate,word) %>%
  summarise(count=n()) %>%  # 算字詞單篇總數用summarise
  filter(count>3) %>%  # 過濾出現太少次的字
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 4,167 x 3
## # Groups:   artDate [11]
##    artDate    word  count
##    <date>     <chr> <int>
##  1 2021-03-25 中國    949
##  2 2021-03-26 中國    859
##  3 2021-03-26 新疆    506
##  4 2021-03-24 中國    439
##  5 2021-03-25 抵制    395
##  6 2021-03-25 新疆    361
##  7 2021-03-26 抵制    336
##  8 2021-03-25 台灣    304
##  9 2021-03-25 藝人    281
## 10 2021-03-26 台灣    280
## # ... with 4,157 more rows

準備情緒字典:

P <- read_file("positive.txt") # 正向字典txt檔
N <- read_file("negative.txt") # 負向字典txt檔

typeof(P)
## [1] "character"
typeof(N)
## [1] "character"
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

# 檢視字典
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive
head(N)
##       word sentiment
## 1 一無所有  negative
## 2 七竅生煙  negative
## 3     上當  negative
## 4     下流  negative
## 5     下等  negative
## 6     不仁  negative

文章和字典合併:

MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count))+
  scale_x_date(labels = date_format("%m/%d"))

# sentiment_count:artDate,sentiment,count
sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

檢視資料的範圍(日期)

range(sentiment_count$artDate) #"2021-03-05" "2021-03-26"
## [1] "2021-03-05" "2021-03-26"

正負情緒分數折線圖(y是情緒分數、x是日期)

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-05','2021-03-26'))
  )+
  # 加上標示日期的線
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-22'))
                                                 [1]])),colour = "red") 

可以發現在對岸狀況爆發之前,相關的討論並不多,但從22號事件爆發開始,就快速上升,符合直覺、常識的理解。而情緒基本上是一致的負面。

正負情緒比例折線圖(y是情緒分數、x是日期)

sentiment_count %>% 
  # 標準化的部分
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-05','2021-03-28'))
  )+
  # 加上標示日期的線
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-22'))
                                                 [1]])),colour = "red")

比例上來看,22號事件爆發當天,產生了非常極端的負面情緒,之後則回歸到正常。可能是因為當天的時候大家都陷入情緒化的發言,之後回歸理性的討論才因此下降。

查看每天的情緒分數排名

sentiment_count %>%
  select(count,artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 12 x 2
##    artDate      sum
##    <date>     <int>
##  1 2021-03-26  2686
##  2 2021-03-25  2470
##  3 2021-03-24   898
##  4 2021-03-05   424
##  5 2021-03-19   260
##  6 2021-03-07   242
##  7 2021-03-11   125
##  8 2021-03-06   114
##  9 2021-03-23    27
## 10 2021-03-21    13
## 11 2021-03-08     5
## 12 2021-03-22     3

意外的是22號當天並沒有太多的反映,反而是之後才開始起來。這和分數折線圖是一致的,但和比例折線圖似乎就產生分歧。

03-26文字雲

word_count %>% 
  filter(!(word %in% c("新疆"))) %>%
  filter(artDate == as.Date('2021-03-26')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>100) %>%   # 過濾出現太少次的字
  wordcloud2()
## Adding missing grouping variables: `artDate`

03-25文字雲

a <- word_count %>% 
  filter(!(word %in% c("新疆"))) %>%
  filter(artDate == as.Date('2021-03-25')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>100) %>%   # 過濾出現太少次的字
  wordcloud2()
## Adding missing grouping variables: `artDate`
my_graph <- a
saveWidget(my_graph, "temp.html", selfcontained = F)
webshot("temp.html", "wc1.png", delay = 5, vwidth = 500, vheight = 500)

03-24文字雲

b <- word_count %>% 
  filter(!(word %in% c("新疆"))) %>%
  filter(artDate == as.Date('2021-03-24')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>50) %>%   # 過濾出現太少次的字
  wordcloud2()
## Adding missing grouping variables: `artDate`
my_graph <- b
saveWidget(my_graph, "temp.html", selfcontained = F)
webshot("temp.html", "wc1.png", delay = 5, vwidth = 500, vheight = 500)

有趣的是,一致的都是有中國,但是另外一方(歐盟、BCI等)卻沒有什麼討論。反而是美國被頻頻提起。可能是因為大家在討論還沒有行動的美國之後會採取什麼行動(?)

找出情緒字典代表字(:

# sentiment_sum:word,sentiment,sum
sentiment_sum <- 
  word_count %>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

負面的代表字是『抵制』很好理解,但可能只流於標題意思或對事件的描述。而正面的『可以』、『支持』的對象是誰(抵制的人、新疆少數民族?)會有不同的解讀。某種程度上這恐怕沒有辦法幫助我們分析網民的立場和意見。

情緒字文字雲

sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # positive negative
                   max.words = 50)

統計正負面文章:

# 依據情緒值的正負比例歸類文章
article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% #把正負面情緒展開,缺值補0
  mutate(type = case_when(positive > negative ~ "positive", 
                             TRUE ~ "negative")) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
##   type     count
##   <chr>    <int>
## 1 negative   175
## 2 positive    76

負面文章數量較多

正負面文章統計圖:

article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")


article_type_date %>%
  group_by(artDate,type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) + 
  geom_bar(stat = "identity", position = "dodge")+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-24','2021-03-26'))
               )
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 16 rows containing missing values (geom_bar).

除了第一天是正面外,之後都是壓倒性的負面。但值得深思的是,『抵制』這個詞本身被歸類在負面詞,但是在這次事件上,『抵制』卻也可能只是對事件的描述。可能要做進一步的調整和分析才能下結論。