載入 packages

library(data.table)
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(wordcloud2)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract

載入資料

資料搜集從 2017/1/1至2019/4/15 總計有609篇與食品安全相關的文章

setwd("~/chloe_project/")
food_data <- fread("./0509_food_data_artWordFreq.csv", encoding = "UTF-8")
food_data$artDate = food_data$artDate %>% as.Date("%Y/%m/%d")
str(food_data)
## Classes 'data.table' and 'data.frame':   51830 obs. of  6 variables:
##  $ artTitle: chr  "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" ...
##  $ artDate : Date, format: "2017-01-05" "2017-01-05" ...
##  $ artTime : chr  "17:46:05" "17:46:05" "17:46:05" "17:46:05" ...
##  $ artUrl  : chr  "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" ...
##  $ word    : chr  "自然" "基改" "食品" "不好" ...
##  $ count   : int  4 3 2 2 2 2 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

畫出日期與文章數的關係圖

因為做過斷詞了,所以同一篇文章的單字會被重複列出很多次 列出不重複的文章日期和文章連結

# 把重複的文章日期與文章連結組合去除
food_data_perDate <- food_data %>% 
  select(artDate, artUrl) %>% 
  distinct()

# 計算每天有多少篇文章
article_count_by_date <- food_data_perDate %>% 
  group_by(artDate) %>% 
  summarise(count = n())

plot_article_by_date <- 
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 2) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("食品議題 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。

plot_article_by_date

由上圖去察看兩個文章篇數特別高的時間點 可以發現 2018/1/29、2018/1/30、2017/3/26、2018/1/28、2017/2/22 這五天的食安相關文章較其他天多

# 將每日的文章數量依數量大小排序
article_count_by_date %>% 
  arrange(desc(count))
## # A tibble: 385 x 2
##    artDate    count
##    <date>     <int>
##  1 2018-01-29     8
##  2 2018-01-30     7
##  3 2017-03-26     6
##  4 2018-01-28     6
##  5 2017-02-22     5
##  6 2017-04-08     5
##  7 2017-05-18     5
##  8 2017-08-29     5
##  9 2018-12-06     5
## 10 2017-06-14     4
## # … with 375 more rows

文字雲

food_data_word_count <- 
  food_data %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

wordcloud2(food_data_word_count)

字頻長條圖

food_data_word_count_hist <- 
  food_data_word_count %>% 
  top_n(30, sum) %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(x=word, y=sum)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y="詞頻") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

food_data_word_count_hist

看公投前後民眾反應

data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")

data_before_voted_plot <- 
  data_before_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 20) %>% 
  arrange(desc(sum)) %>% 
  wordcloud2()

data_after_voted_plot <-
  data_after_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) %>% 
  wordcloud2()

data_before_voted_plot
data_after_voted_plot

情緒分析

food_data <- 
  food_data %>% 
  filter(!grepl('_',word))

# 載入情緒字典
positive_dict <- read_file("./positive.txt")
negative_dict <- read_file("./negative.txt")

positive <- strsplit(positive_dict, "[,]")[[1]]
negative <- strsplit(negative_dict, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)

# expand.grid 列出vector間的所有組合
# 所以這邊會把全部日期和positive及negative的組合列出來
all_dates <- 
  expand.grid(seq(as.Date(min(food_data$artDate)), as.Date(max(food_data$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

plot_table <- food_data %>%
  select(artDate,word,count) %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))  
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = tidyr::replace_na(count, 0))

把情緒走勢畫在一起,可以發現負面的部分佔據比較多

plot_table %>%
  ggplot() + 
  geom_line(aes(x = artDate,y = count,colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d")) 

把每天的每篇文章正面情緒減去負面情緒

senti_by_date_LIWC <- food_data %>% 
  inner_join(LIWC_ch) %>%
  group_by(artDate, sentiment) %>%
  summarise(n = sum(count)) %>%
  tidyr::spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(method = 'LIWC')
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
senti_by_date_LIWC
## # A tibble: 349 x 5
## # Groups:   artDate [349]
##    artDate    positive negative sentiment method
##    <date>        <dbl>    <dbl>     <dbl> <chr> 
##  1 2017-01-05        0        5        -5 LIWC  
##  2 2017-01-07        1        2        -1 LIWC  
##  3 2017-01-08        2        0         2 LIWC  
##  4 2017-01-10        9        2         7 LIWC  
##  5 2017-01-15       11        3         8 LIWC  
##  6 2017-01-18       21        5        16 LIWC  
##  7 2017-01-19        8        7         1 LIWC  
##  8 2017-01-20        1        2        -1 LIWC  
##  9 2017-01-24        3        2         1 LIWC  
## 10 2017-01-25        5       18       -13 LIWC  
## # … with 339 more rows
# senti_by_date_LIWC %>% 
#   ggplot(aes(x= artDate, y = sentiment)) +
#   geom_col(show.legend = FALSE) +
#   scale_x_date(labels = date_format("%m/%d")) +
#   facet_wrap(~method, ncol = 1, scales = "fixed")+
#   geom_text(aes(label = sentiment))

LIWC 字典分析最常見正面及負面情緒字

food_data_word_count %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

把資料分成公投前和公投後去看用到的情緒詞有什麼不同

data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")

food_data_word_count_before <- 
  data_before_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

food_data_word_count_after <-
  data_after_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

food_data_word_count_before %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(15, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

food_data_word_count_after %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(15, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector