載入 packages

library(data.table)
library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(scales)
library(wordcloud2)
library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

載入資料

資料搜集從 2017/1/1至2019/4/15 總計有609篇與食品安全相關的文章

setwd("~/chloe_project/")
food_data <- fread("./0509_food_data_artWordFreq.csv", encoding = "UTF-8")
food_data$artDate = food_data$artDate %>% as.Date("%Y/%m/%d")
str(food_data)

## Classes 'data.table' and 'data.frame':   51830 obs. of  6 variables:
##  $ artTitle: chr  "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" ...
##  $ artDate : Date, format: "2017-01-05" "2017-01-05" ...
##  $ artTime : chr  "17:46:05" "17:46:05" "17:46:05" "17:46:05" ...
##  $ artUrl  : chr  "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" ...
##  $ word    : chr  "自然" "基改" "食品" "不好" ...
##  $ count   : int  4 3 2 2 2 2 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

畫出日期與文章數的關係圖

因為做過斷詞了，所以同一篇文章的單字會被重複列出很多次列出不重複的文章日期和文章連結

# 把重複的文章日期與文章連結組合去除
food_data_perDate <- food_data %>% 
  select(artDate, artUrl) %>% 
  distinct()

# 計算每天有多少篇文章
article_count_by_date <- food_data_perDate %>% 
  group_by(artDate) %>% 
  summarise(count = n())

plot_article_by_date <- 
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 2) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("食品議題 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定，避免中文字顯示錯誤。

plot_article_by_date

由上圖去察看兩個文章篇數特別高的時間點可以發現 2018/1/29、2018/1/30、2017/3/26、2018/1/28、2017/2/22 這五天的食安相關文章較其他天多

# 將每日的文章數量依數量大小排序
article_count_by_date %>% 
  arrange(desc(count))

## # A tibble: 385 x 2
##    artDate    count
##    <date>     <int>
##  1 2018-01-29     8
##  2 2018-01-30     7
##  3 2017-03-26     6
##  4 2018-01-28     6
##  5 2017-02-22     5
##  6 2017-04-08     5
##  7 2017-05-18     5
##  8 2017-08-29     5
##  9 2018-12-06     5
## 10 2017-06-14     4
## # … with 375 more rows

文字雲

food_data_word_count <- 
  food_data %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

wordcloud2(food_data_word_count)

字頻長條圖

food_data_word_count_hist <- 
  food_data_word_count %>% 
  top_n(30, sum) %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(x=word, y=sum)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y="詞頻") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

food_data_word_count_hist

看公投前後民眾反應

data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")

data_before_voted_plot <- 
  data_before_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 20) %>% 
  arrange(desc(sum)) %>% 
  wordcloud2()

data_after_voted_plot <-
  data_after_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) %>% 
  wordcloud2()

data_before_voted_plot

data_after_voted_plot

情緒分析

food_data <- 
  food_data %>% 
  filter(!grepl('_',word))

# 載入情緒字典
positive_dict <- read_file("./positive.txt")
negative_dict <- read_file("./negative.txt")

positive <- strsplit(positive_dict, "[,]")[[1]]
negative <- strsplit(negative_dict, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)

# expand.grid 列出vector間的所有組合
# 所以這邊會把全部日期和positive及negative的組合列出來
all_dates <- 
  expand.grid(seq(as.Date(min(food_data$artDate)), as.Date(max(food_data$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")

plot_table <- food_data %>%
  select(artDate,word,count) %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = tidyr::replace_na(count, 0))

把情緒走勢畫在一起，可以發現負面的部分佔據比較多

plot_table %>%
  ggplot() + 
  geom_line(aes(x = artDate,y = count,colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"))

把每天的每篇文章正面情緒減去負面情緒

senti_by_date_LIWC <- food_data %>% 
  inner_join(LIWC_ch) %>%
  group_by(artDate, sentiment) %>%
  summarise(n = sum(count)) %>%
  tidyr::spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(method = 'LIWC')

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

senti_by_date_LIWC

## # A tibble: 349 x 5
## # Groups:   artDate [349]
##    artDate    positive negative sentiment method
##    <date>        <dbl>    <dbl>     <dbl> <chr> 
##  1 2017-01-05        0        5        -5 LIWC  
##  2 2017-01-07        1        2        -1 LIWC  
##  3 2017-01-08        2        0         2 LIWC  
##  4 2017-01-10        9        2         7 LIWC  
##  5 2017-01-15       11        3         8 LIWC  
##  6 2017-01-18       21        5        16 LIWC  
##  7 2017-01-19        8        7         1 LIWC  
##  8 2017-01-20        1        2        -1 LIWC  
##  9 2017-01-24        3        2         1 LIWC  
## 10 2017-01-25        5       18       -13 LIWC  
## # … with 339 more rows

# senti_by_date_LIWC %>% 
#   ggplot(aes(x= artDate, y = sentiment)) +
#   geom_col(show.legend = FALSE) +
#   scale_x_date(labels = date_format("%m/%d")) +
#   facet_wrap(~method, ncol = 1, scales = "fixed")+
#   geom_text(aes(label = sentiment))

LIWC 字典分析最常見正面及負面情緒字

food_data_word_count %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

把資料分成公投前和公投後去看用到的情緒詞有什麼不同

data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")

food_data_word_count_before <- 
  data_before_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

food_data_word_count_after <-
  data_after_voted %>% 
  filter(!(word == "食品" | word == "台灣" | word=="表示" | word == "有沒有" | word == "八卦" |
           word == "臺灣" | word == "現在" | word == "不是")) %>% 
  group_by(word) %>% 
  summarise(sum = sum(count)) %>% 
  filter(sum > 10) %>% 
  arrange(desc(sum)) 

food_data_word_count_before %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(15, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

food_data_word_count_after %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(15, wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

food_privacy

載入 packages

載入資料

畫出日期與文章數的關係圖

文字雲

字頻長條圖

看公投前後民眾反應

情緒分析