System parameter settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""

Install the required packages

# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr", "data.table", "reshape2", "wordcloud", "tidyr", "scales", "jiebaR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the libraries

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)

Data overview

  • Source: all articles on the PTT Gossiping board from 2021-03-01 to 2021-03-26, collected from a text-analytics platform
  • Dataset: yen_ptt_cotton.csv
  • Keywords: 新疆棉花 (Xinjiang cotton), 新疆棉, BCI, 藝人 (celebrity), 代言 (endorsement), 解約 (contract termination), 抵制 (boycott)
  • Period: 2021-03-01 to 2021-03-26

This analysis focuses on the recent Xinjiang "blood cotton" controversy, examining the related discussion among PTT users and comparing it with the discussion on udn news. We address four questions:

1. When did discussion of the Xinjiang cotton controversy appear, and when did it peak?
2. What do the positive and negative discussions cover, and do they differ over time?
3. What are the approximate sentiment scores of the positive and negative discussions?
4. What positive and negative sentiment accompanies each brand's mentions in the articles?

PTT articles are kept only if their title or body matches at least one of the Xinjiang-cotton keywords listed above.

# read in the articles and the comments
MetaData = fread('yen_ptt_cotton_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('yen_ptt_cotton_articleReviews.csv',encoding = 'UTF-8')

# filter the articles again: 203 remain
keywords = c('新疆棉花','新疆棉','BCI','藝人','代言','解約','抵制')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])

# keep the comments belonging to the filtered articles
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

1. Data preprocessing

(1). Tokenizing the articles

Set up the tokenization engine

# add a user-defined dictionary and a stop-word list
jieba_tokenizer <- worker(user="./dict/cotton.txt", stop_word = "./dict/stop_words.txt")

# define the tokenizer function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
# tokenize the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)

# stack the article and comment tokens
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 
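
As a quick check that the engine and the user dictionary behave as expected, the tokenizer can be run on a sample string. This is only a sketch: the exact tokens depend on what dict/cotton.txt and dict/stop_words.txt contain.

# tokenize a sample sentence (a sketch; output depends on dict/cotton.txt)
customized_tokenizer("H&M發表聲明抵制新疆棉花")
# e.g. list(c("H&M", "發表", "聲明", "抵制", "新疆棉花")), assuming 新疆棉花 is in the user dictionary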

(2). Basic cleaning

  • Normalize the date format
  • Remove special characters and very low-frequency words
# normalize the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")

# filter out special characters (English letters and digits are kept, since we track brands)
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(nchar(word) > 1)                # drop single-character tokens
  
# daily term frequencies
# word_count: artDate, word, count
word_count <- data_select %>%
  select(artDate,word) %>%
  group_by(artDate,word) %>%
  summarise(count=n()) %>%  # count per day and word
  filter(count>3) %>%  # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 2,513 x 3
## # Groups:   artDate [18]
##    artDate    word  count
##    <date>     <chr> <int>
##  1 2021-03-25 中國    835
##  2 2021-03-26 中國    770
##  3 2021-03-26 抵制    398
##  4 2021-03-25 抵制    366
##  5 2021-03-25 台灣    358
##  6 2021-03-26 台灣    324
##  7 2021-03-25 藝人    319
##  8 2021-03-26 藝人    243
##  9 2021-03-25 新疆    220
## 10 2021-03-25 品牌    199
## # ... with 2,503 more rows

2. Preparing the LIWC dictionary

LIWC, short for Linguistic Inquiry and Word Count, was published by the psychologist Pennebaker in 2001; it divides words into positive and negative sentiment categories.

Read in the files; within each file the words are separated by commas.

  • Added to the positive list: 人權, 挺, 家鄉, 祖國
  • Added to the negative list: 不尊重, 濫用, 封殺, 跪舔, 開刀, 違約金, 舔共, 支那, 低能, 腦粉, 切割
P <- read_file("./dict/liwc/positive.txt") # positive dictionary (txt)
N <- read_file("./dict/liwc/negative.txt") # negative dictionary (txt)

# each dictionary file is read in as one long string
typeof(P)
## [1] "character"

Split the strings into individual words, then combine the two sentiment dictionaries

# split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")

# stack the two dictionaries
LIWC = rbind(P, N)

# inspect the dictionary
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive
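
As a quick sanity check (a sketch), we can confirm that the custom additions listed above made it into the combined dictionary and see how many words each polarity holds:

# verify the custom additions are present, and count words per polarity (a sketch)
c("人權", "舔共", "違約金") %in% LIWC$word
table(LIWC$sentiment)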

3. Joining the articles with the LIWC sentiment dictionary

Before plotting sentiment, we look at the daily posting volume: discussion only picks up after about 3/23 (when H&M issued its statement boycotting Xinjiang cotton).

Daily post counts (line chart)

MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    scale_x_date(labels = date_format("%m/%d"))

Find the words in the corpus that LIWC marks as positive or negative

Compute the daily sentiment totals (sentiment_count)

# sentiment_count:artDate,sentiment,count
sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

Positive and negative sentiment counts (line chart)

# check the date range of the data
range(sentiment_count$artDate) # "2021-03-03" "2021-03-26"
## [1] "2021-03-03" "2021-03-26"
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-20','2021-03-26'))
               )+
  # add a vertical reference line at 3/25
  geom_vline(xintercept = as.numeric(as.Date('2021-03-25')), color = "red")
## Warning: Removed 27 row(s) containing missing values (geom_path).


As the chart shows, before 3/23 there were few posts about the Xinjiang cotton controversy and sentiment barely moved; after 3/23 (H&M's statement boycotting Xinjiang cotton), both positive and negative counts rose sharply, with the gap widest on 3/25 (negative counts far exceeding positive ones).

Positive and negative sentiment ratios (line chart)

sentiment_count %>% 
  # normalize within each day
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-20','2021-03-26'))
               )+
  # add a vertical reference line at 3/24
  geom_vline(xintercept = as.numeric(as.Date('2021-03-24')), colour = "red")
## Warning: Removed 27 row(s) containing missing values (geom_path).


As noted above, the controversy took off on 3/23: the chart shows the share of positive words falling sharply after 3/23 while the negative share rises.

On 3/24 many celebrities began stating their position on the brands' boycott of Xinjiang cotton, most commonly by terminating their endorsement contracts and distancing themselves from those brands. Some of these celebrities are Taiwanese, and PTT users criticized their behavior harshly; we suspect this drives the continued rise in negative sentiment after 3/24.

# rank the daily sentiment totals
sentiment_count %>%
  select(count,artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 20 x 2
##    artDate      sum
##    <date>     <int>
##  1 2021-03-25  2307
##  2 2021-03-26  2161
##  3 2021-03-24   428
##  4 2021-03-03    86
##  5 2021-03-04    36
##  6 2021-03-10    32
##  7 2021-03-06    26
##  8 2021-03-23    24
##  9 2021-03-18    20
## 10 2021-03-19    19
## 11 2021-03-09    17
## 12 2021-03-14    16
## 13 2021-03-16    14
## 14 2021-03-11     9
## 15 2021-03-20     9
## 16 2021-03-08     7
## 17 2021-03-12     3
## 18 2021-03-17     3
## 19 2021-03-05     2
## 20 2021-03-15     1

4. Word clouds

Pick dates of interest and draw word clouds to see what was being discussed.

2021-03-19 word cloud
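
The code for this cloud was not echoed; below is a sketch mirroring the 2021-03-25 cloud further down, with the date swapped and a lower frequency threshold (count > 5 is an assumption, chosen because 3/19 had far fewer posts):

# a sketch of the 2021-03-19 cloud; the count > 5 threshold is an assumption
word_count %>%
  filter(!(word %in% c("新疆棉花"))) %>%
  filter(artDate == as.Date('2021-03-19')) %>%
  ungroup() %>%
  select(word, count) %>%
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count > 5) %>%
  wordcloud2()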


The 3/19 cloud suggests that before H&M's 3/23 statement, attention centered on the Xinjiang "blood cotton" issue itself. Words such as 「獨立」 (independence), 「黑人」 (Black people), 「歷史」 (history), 「統治」 (rule), 「歧視」 (discrimination), 「非裔」 (people of African descent) and dynasty-related keywords appear frequently, suggesting users linked the event to earlier human-rights abuses such as racial persecution and forced labor, and debated why minorities keep being oppressed and why history repeats itself.

2021-03-25 word cloud

# draw the word cloud

word_count %>% 
  filter(!(word %in% c("新疆棉花"))) %>%
  filter(artDate == as.Date('2021-03-25')) %>% 
  ungroup() %>%          # drop the artDate grouping before selecting
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>45) %>%   # drop words that appear too rarely
  wordcloud2()


Looking first at 2021-03-25, the peak of both comment volume and sentiment: echoing the sentiment analysis above, apart from 中國 (China) and 台灣 (Taiwan), 「抵制」 (boycott) appears more often than 「支持」 (support), reflecting that after 3/25 negative discussion of the Xinjiang cotton issue outweighed positive discussion.

The controversy grew from the minority human-rights issue behind Xinjiang cotton, to Western brands issuing statements boycotting it, to a scramble in which any celebrity tied to a boycotting brand (e.g. through an endorsement) had to cut those ties at once. Nationality has long been a sensitive topic in Taiwan, so when a Taiwanese celebrity announced terminating an endorsement contract, PTT users read it as backing Xinjiang cotton in order to keep earning renminbi (even though backing Xinjiang cotton effectively condones forcing Xinjiang minorities to produce "blood cotton"), which angered them. In the cloud above, 「許光漢」 (Greg Hsu, who shot to fame with the Taiwanese drama 「想見你」 and split with CK) is discussed the most, followed by 「歐陽」 and 「彭于晏」 (Eddie Peng).

5. Representative sentiment words

Compute the total frequency of every sentiment word (sentiment_sum) and find the representative words for each polarity.

Representative positive and negative words

# sentiment_sum:word,sentiment,sum
sentiment_sum <- 
  word_count %>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

Another way to present this: a comparison word cloud of positive and negative words

sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # positive negative
                   max.words = 50)


The main positive representative words are 「支持」 (support) and 「人權」 (human rights); the main negative ones are 「抵制」 (boycott) and 「支那」 (a derogatory term for China).

2021-03-25 representative sentiment words (to see how the representative words shift across dates)

sentiment_sum_select <- 
word_count %>%
  filter(artDate == as.Date('2021-03-25')) %>% 
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select   %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment 0325",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()


On 3/25, as celebrities cut ties with brands, negative words such as 「違約金」 (contract penalty), 「可憐」 (pathetic), 「噁心」 (disgusting) and 「失望」 (disappointed) rose.

2021-03-25 comparison word cloud

sentiment_sum_select %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # positive negative
                   max.words = 50)

6. Classifying positive and negative articles

The sentiment analysis so far sums over all words. Next we separate positive from negative articles to see whether anything new emerges: we compute each article's positive and negative sentiment counts and classify the article by whichever is larger.

# classify each article by comparing its positive and negative word counts
article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread the sentiment counts into columns, filling gaps with 0
  mutate(type = case_when(positive > negative ~ "positive", 
                             TRUE ~ "negative")) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# count the articles of each type
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
##   type     count
##   <chr>    <int>
## 1 negative   154
## 2 positive    46
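
For reference, negative articles make up 154 / (154 + 46) = 77% of the classified set; a one-liner (a sketch) computes the shares directly:

# share of each article type (154 negative vs. 46 positive -> 0.77 / 0.23)
article_type %>% count(type) %>% mutate(share = n / sum(n))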

Counts of positive and negative articles (bar chart)

# attach dates to the classified articles
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")


article_type_date %>%
  group_by(artDate,type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) + 
  geom_bar(stat = "identity", position = "dodge")+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-03-20','2021-03-27'))
               )
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 22 rows containing missing values (geom_bar).


Negative articles increase sharply from about 3/24 onward.

Pick out the positive and negative articles and join each set back to the tokenized words.

# negative_article:artUrl,word
negative_article <-
article_type %>%
  filter(type=="negative")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

# positive_article:artUrl,word
positive_article <-
article_type %>%
  filter(type=="positive")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Sentiment keywords in negative articles (the words contributing most to each polarity within negative-leaning articles)

# keyword contribution chart for negative articles
negative_article %>%
inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = n()
    )%>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

Sentiment keywords in positive articles (the words contributing most to each polarity within positive-leaning articles)

# keyword contribution chart for positive articles
positive_article %>%
inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = n()
    )%>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.


Comparing the two charts, the keyword sets are not drastically different. Negative-leaning articles more often contain 「抵制」 (boycott), 「支那」 and 「可憐」 (pathetic), criticizing Taiwanese celebrities who backed Xinjiang cotton and dropped the boycotting brands they endorsed; positive-leaning articles contain more 「支持」 (support) and 「人權」 (human rights), focusing either on Chinese celebrities supporting their homeland or on brands dropping Xinjiang cotton out of concern for human rights.

Sentiment by brand: the positive and negative sentiment accompanying each brand's mentions in articles

# ----------------- classify each article by its positive/negative word counts (repeats the step from section 6)
article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread the sentiment counts into columns, filling gaps with 0
  mutate(type = case_when(positive > negative ~ "positive",
                          TRUE ~ "negative")) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
# ----------------- adidas: count articles of each sentiment type
adidas_Brand_all = 
  data_select %>%
  filter(word == "adidas" | word == "愛迪達") %>%
  inner_join(article_type) %>%
  mutate(brand='adidas')%>%
  group_by(brand,type)%>%
  summarise(count=n())%>%
  data.frame() 
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# ----------------- puma: count articles of each sentiment type
puma_Brand_all = 
  data_select %>%
  filter(word == "puma" | word == "彪馬") %>% # 彪馬 = Puma's Chinese name
  inner_join(article_type) %>%
  mutate(brand='puma')%>%
  group_by(brand,type)%>%
  summarise(count=n())%>%
  data.frame() 
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------UNIQLO
UNIQLO_Brand_all = 
  data_select %>%
  filter(word == "uniqlo") %>%
  inner_join(article_type) %>%
  mutate(brand='uniqlo')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------converse
converse_Brand_all = 
  data_select %>%
  filter(word == "converse") %>%
  inner_join(article_type) %>%
  mutate(brand='converse')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------Balance
Balance_Brand_all = 
  data_select %>%
  filter(word == "balance") %>%
  inner_join(article_type) %>%
  mutate(brand='NEWbalance')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------GAP
GAP_Brand_all = 
  data_select %>%
  filter(word == "gap") %>%
  inner_join(article_type) %>%
  mutate(brand='gap')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------Fila
Fila_Brand_all = 
  data_select %>%
  filter(word == "fila") %>%
  inner_join(article_type) %>%
  mutate(brand='Fila')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------ZARA
ZARA_Brand_all = 
  data_select %>%
  filter(word == "zara") %>%
  inner_join(article_type) %>%
  mutate(brand='ZARA')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# ZARA has no positive articles, so append a zero-count row
# to keep both sentiment types in the chart
brand='ZARA'
type='positive'
count=0
tmp=data.frame(brand,type,count)
ZARA_Brand_all=rbind(ZARA_Brand_all,tmp)
# ----------------- MUJI (無印良品)
MUJI_Brand_all = 
  data_select %>%
  filter(word == "muji"|word =='無印良品') %>%
  inner_join(article_type) %>%
  mutate(brand='MUJI')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------LV
LV_Brand_all = 
  data_select %>%
  filter(word == "lv"|word =='lvmh') %>%
  inner_join(article_type) %>%
  mutate(brand='LV')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# -----------------nike
nike_Brand_all = 
  data_select %>%
  filter(word == "nike") %>%
  inner_join(article_type) %>%
  mutate(brand='nike')%>%
  group_by(brand,type)%>%
  summarise(count=n())
## Joining, by = "artUrl"
## `summarise()` has grouped output by 'brand'. You can override using the `.groups` argument.
# ----------------- combine all brands
  Brand_all = 
    rbind(adidas_Brand_all,puma_Brand_all,UNIQLO_Brand_all,converse_Brand_all
          ,Balance_Brand_all,GAP_Brand_all,Fila_Brand_all,ZARA_Brand_all
          ,MUJI_Brand_all,LV_Brand_all,nike_Brand_all)
# ---------------------------------- plot
Brand_all%>%
  ggplot(aes(x = count, y = brand, fill = type)) + 
  geom_bar(stat = "identity", position = "dodge")
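
The per-brand blocks above repeat the same pipeline eleven times. Below is a compact sketch with the same logic, assuming the same data_select and article_type; brand_aliases and Brand_all_compact are hypothetical names, and tidyr::complete fills in zero rows (such as ZARA's missing positive count) instead of binding them by hand:

# a compact alternative to the per-brand blocks above (a sketch, same logic)
brand_aliases <- list(
  adidas     = c("adidas", "愛迪達"),
  puma       = c("puma", "彪馬"),
  uniqlo     = "uniqlo",
  converse   = "converse",
  NEWbalance = "balance",
  gap        = "gap",
  Fila       = "fila",
  ZARA       = "zara",
  MUJI       = c("muji", "無印良品"),
  LV         = c("lv", "lvmh"),
  nike       = "nike"
)

Brand_all_compact <- bind_rows(lapply(names(brand_aliases), function(b) {
  data_select %>%
    filter(word %in% brand_aliases[[b]]) %>%
    inner_join(article_type, by = "artUrl") %>%
    mutate(brand = b) %>%
    group_by(brand, type) %>%
    summarise(count = n(), .groups = "drop")
})) %>%
  complete(brand, type, fill = list(count = 0))  # fills e.g. ZARA's missing positive row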


Our expectation was that a brand using Xinjiang cotton should attract mostly negative sentiment, since that amounts to endorsing forced minority labor. Yet fila stands out from the other brands (its positive words outnumber its negative ones). Digging deeper, most fila articles on PTT are news reposts quoting Chinese netizens, e.g. 「這發言太帥了」 ("what a cool statement"), 「以後只買斐樂(FILA)」 ("from now on I'll only buy FILA") and 「嗚嗚嗚,可以放心買了」 ("now I can buy with peace of mind"), and the article text repeatedly uses positive words such as 「支持」 (support) and 「表態」 (take a stand), so fila's articles are classified differently from the other brands'.

7. Adding another data source for comparison

# read in the udn (United Daily News) data for comparison
News  = fread('yen_news_cotton_articleMetaData.csv',encoding = 'UTF-8')
NToken <- News %>% unnest_tokens(word, sentence, token=customized_tokenizer)
PTT_Token <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 

PTT_Token = PTT_Token %>% mutate(source = "ptt")
News_Token = NToken %>% mutate(source = "news")

# combine the two sources
data_combine = rbind(PTT_Token,News_Token[,c("artDate","artUrl", "word","source")])

data_combine$artDate= data_combine$artDate %>% as.Date("%Y/%m/%d")

Sentiment comparison: ptt vs. news

range(News$artDate) # "2021/03/02" "2021/03/27"
## [1] "2021/03/02" "2021/03/27"
data_combine %>%
  inner_join(LIWC) %>%
  group_by(artDate,sentiment,source) %>%
  summarise(count = n()) %>%
  filter(artDate>='2021-03-20') %>%
  
  # plot
  ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  geom_col(position="dodge") + 
  scale_x_date(labels = date_format("%m/%d")) +
  labs(title = "sentiment of ptt & news", fill = "sentiment") +
  facet_wrap(~source, ncol = 1, scales="free_y")  # free_y gives each facet its own y-axis
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate', 'sentiment'. You can override using the `.groups` argument.


The histograms show that on both udn news and ptt negative sentiment exceeds positive, though news carries slightly more positive sentiment than ptt. We suspect this is because much of the news coverage narrates the event from the perspective of Chinese netizens (e.g. the fila case above, where Chinese netizens responded mostly with approval and support), whereas ptt mostly carries Taiwanese users' criticism of the "blood cotton" issue and of celebrities cutting ties with brands. The discussion peaks occur at roughly the same time in both sources.

Summary

1. When did discussion of the Xinjiang cotton controversy appear, and when did it peak?

Discussion heats up from about 3/24 and peaks on 3/25; after the 25th it cools somewhat but remains quite active.

2. What do the positive and negative discussions cover, and do they differ over time?

Positive discussion mainly concerns brands refusing Xinjiang cotton out of concern for human rights; negative discussion mainly concerns Taiwanese celebrities backing Xinjiang cotton and cutting ties with the boycotting brands they endorsed, which Taiwanese users read as pandering to Beijing. Because the controversy only erupted with H&M's statement on 3/23 and was still unfolding, positive and negative volume rose together, and no clear difference in timing is visible yet.

3. Which scores higher, the positive or the negative discussion?

Negative sentiment has consistently exceeded positive sentiment. This likely reflects that PTT users have long been critical of Taiwanese celebrities' pro-China gestures, and that forced minority labor and similar human-rights abuses have been condemned ever since the era of American slavery.

4. What positive and negative sentiment accompanies each brand's mentions in articles?

Nike, Adidas and LV attract the most negative mentions; the most interesting case is fila, the only brand whose positive words outnumber its negative ones. As argued above, a brand that boycotts Xinjiang "blood cotton" signals a refusal to exploit minority labor and respect for workers' rights, so boycotting brands ought to attract the more positive sentiment; yet fila, which declared support for Xinjiang cotton, collected more positive words instead. The reason is that most PTT articles about brands' positions are news reposts written from the perspective of Chinese netizens (support = patriotism = approval from Chinese netizens), which is why the chart diverges from our expectation.