library(data.table)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(wordcloud2)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
Data were collected from 2017/1/1 to 2019/4/15, for a total of 609 articles related to food safety.
setwd("~/chloe_project/")
food_data <- fread("./0509_food_data_artWordFreq.csv", encoding = "UTF-8")
food_data$artDate <- as.Date(food_data$artDate, format = "%Y/%m/%d")
str(food_data)
## Classes 'data.table' and 'data.frame': 51830 obs. of 6 variables:
## $ artTitle: chr "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" "[問卦]基改食品真的不好嗎?" ...
## $ artDate : Date, format: "2017-01-05" "2017-01-05" ...
## $ artTime : chr "17:46:05" "17:46:05" "17:46:05" "17:46:05" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" "https://www.ptt.cc/bbs/Gossiping/M.1483667528.A.9D6.html" ...
## $ word : chr "自然" "基改" "食品" "不好" ...
## $ count : int 4 3 2 2 2 2 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
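A quick sanity check on the 609-article figure quoted above (assuming each artUrl uniquely identifies one article):
# Count distinct article URLs; this should match the 609 articles mentioned above.
n_distinct(food_data$artUrl)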
Because the text has already been tokenized, words from the same article appear on many rows. List the distinct article date / article URL combinations.
# Remove duplicate combinations of article date and article URL
food_data_perDate <- food_data %>%
select(artDate, artUrl) %>%
distinct()
# Count how many articles were posted each day
article_count_by_date <- food_data_perDate %>%
group_by(artDate) %>%
summarise(count = n())
plot_article_by_date <-
article_count_by_date %>%
# aesthetics
ggplot(aes(x = artDate, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 2) +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("食品議題 討論文章數") +
xlab("日期") +
ylab("數量") +
# theme
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so CJK text renders correctly
plot_article_by_date
From the plot above, examining the dates with especially high article counts, we find that 2018/1/29, 2018/1/30, 2017/3/26, 2018/1/28, and 2017/2/22 had more food-safety articles than other days.
# Sort the daily article counts in descending order
article_count_by_date %>%
arrange(desc(count))
## # A tibble: 385 x 2
## artDate count
## <date> <int>
## 1 2018-01-29 8
## 2 2018-01-30 7
## 3 2017-03-26 6
## 4 2018-01-28 6
## 5 2017-02-22 5
## 6 2017-04-08 5
## 7 2017-05-18 5
## 8 2017-08-29 5
## 9 2018-12-06 5
## 10 2017-06-14 4
## # … with 375 more rows
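An equivalent way to pull out just the top days, assuming dplyr >= 1.0.0 where slice_max() is available (ties are kept by default):
# Top five days by article count.
article_count_by_date %>%
  slice_max(count, n = 5)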
food_data_word_count <-
food_data %>%
filter(!word %in% c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")) %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum > 10) %>%
arrange(desc(sum))
wordcloud2(food_data_word_count)
food_data_word_count_hist <-
food_data_word_count %>%
top_n(30, sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(x=word, y=sum)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
food_data_word_count_hist
data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")
data_before_voted_plot <-
data_before_voted %>%
filter(!word %in% c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")) %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum > 20) %>%
arrange(desc(sum)) %>%
wordcloud2()
data_after_voted_plot <-
data_after_voted %>%
filter(!word %in% c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")) %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum > 10) %>%
arrange(desc(sum)) %>%
wordcloud2()
data_before_voted_plot
data_after_voted_plot
food_data <-
food_data %>%
filter(!grepl('_',word))
# Load the sentiment dictionaries
positive_dict <- read_file("./positive.txt")
negative_dict <- read_file("./negative.txt")
positive <- strsplit(positive_dict, "[,]")[[1]]
negative <- strsplit(negative_dict, "[,]")[[1]]
positive <- data.frame(word = positive, sentiment = "positive")
negative <- data.frame(word = negative, sentiment = "negative")
LIWC_ch <- rbind(positive, negative)
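The inner_join() calls further down warn about coercing a factor word column to character. Assuming this runs on R < 4.0, where data.frame() defaults to stringsAsFactors = TRUE, one way to avoid that warning is to build the dictionary with character columns from the start; a sketch (the _df / _chr names are illustrative only):
# Keep word as a character column so later joins need no factor coercion.
positive_df <- data.frame(word = strsplit(positive_dict, "[,]")[[1]],
                          sentiment = "positive", stringsAsFactors = FALSE)
negative_df <- data.frame(word = strsplit(negative_dict, "[,]")[[1]],
                          sentiment = "negative", stringsAsFactors = FALSE)
LIWC_ch_chr <- rbind(positive_df, negative_df) # same content, character columns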
# expand.grid() lists every combination of the given vectors,
# so this produces every (date, positive/negative) pair (a toy illustration follows below)
all_dates <-
expand.grid(seq(as.Date(min(food_data$artDate)), as.Date(max(food_data$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
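A toy illustration of what expand.grid() produces, using two made-up dates:
# Every pairing of the two inputs: 2 dates x 2 sentiments = 4 rows.
expand.grid(artDate = as.Date(c("2017-01-01", "2017-01-02")),
            sentiment = c("positive", "negative"))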
plot_table <- food_data %>%
select(artDate,word,count) %>%
inner_join(LIWC_ch) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# For dates with no data, set count to 0
plot_table <- all_dates %>%
merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = tidyr::replace_na(count, 0))
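The same result can be obtained without the expand.grid()/merge() step by applying tidyr::complete() to the summarised table; a minimal sketch, assuming a reasonably recent tidyr (plot_table_alt is just an illustrative name):
# Fill in the missing (date, sentiment) combinations directly.
plot_table_alt <- food_data %>%
  select(artDate, word, count) %>%
  inner_join(LIWC_ch, by = "word") %>%
  group_by(artDate, sentiment) %>%
  summarise(count = sum(count)) %>%
  ungroup() %>%
  tidyr::complete(artDate = seq(min(food_data$artDate), max(food_data$artDate), by = "day"),
                  sentiment,
                  fill = list(count = 0))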
Plotting both sentiment trends together, we can see that negative sentiment accounts for the larger share.
plot_table %>%
ggplot() +
geom_line(aes(x = artDate,y = count,colour = sentiment)) +
scale_x_date(labels = date_format("%m/%d"))
For each day, subtract the negative sentiment count from the positive count to get a net sentiment score.
senti_by_date_LIWC <- food_data %>%
inner_join(LIWC_ch) %>%
group_by(artDate, sentiment) %>%
summarise(n = sum(count)) %>%
tidyr::spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(method = 'LIWC')
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
senti_by_date_LIWC
## # A tibble: 349 x 5
## # Groups: artDate [349]
## artDate positive negative sentiment method
## <date> <dbl> <dbl> <dbl> <chr>
## 1 2017-01-05 0 5 -5 LIWC
## 2 2017-01-07 1 2 -1 LIWC
## 3 2017-01-08 2 0 2 LIWC
## 4 2017-01-10 9 2 7 LIWC
## 5 2017-01-15 11 3 8 LIWC
## 6 2017-01-18 21 5 16 LIWC
## 7 2017-01-19 8 7 1 LIWC
## 8 2017-01-20 1 2 -1 LIWC
## 9 2017-01-24 3 2 1 LIWC
## 10 2017-01-25 5 18 -13 LIWC
## # … with 339 more rows
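With tidyr >= 1.0, the spread() step above can be written equivalently with pivot_wider(); a minimal sketch (senti_by_date_LIWC_alt is just an illustrative name):
# Reshape to one row per day with positive/negative columns, filling gaps with 0.
senti_by_date_LIWC_alt <- food_data %>%
  inner_join(LIWC_ch, by = "word") %>%
  group_by(artDate, sentiment) %>%
  summarise(n = sum(count)) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n,
                     values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative,
         method = "LIWC")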
# senti_by_date_LIWC %>%
# ggplot(aes(x= artDate, y = sentiment)) +
# geom_col(show.legend = FALSE) +
# scale_x_date(labels = date_format("%m/%d")) +
# facet_wrap(~method, ncol = 1, scales = "fixed")+
# geom_text(aes(label = sentiment))
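For reference, a runnable version of the commented-out plot above, with the per-bar text labels dropped so the columns stay legible:
# Daily net sentiment (positive minus negative) as a bar chart.
senti_by_date_LIWC %>%
  ggplot(aes(x = artDate, y = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_x_date(labels = date_format("%m/%d")) +
  facet_wrap(~method, ncol = 1, scales = "fixed")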
The most common positive and negative sentiment words according to the LIWC dictionary
food_data_word_count %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10, wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text = element_text(size = 14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Split the data into before and after the referendum (2018/11/24) to see how the sentiment vocabulary differs.
data_before_voted <- food_data %>% filter(artDate <= "2018-11-24")
data_after_voted <- food_data %>% filter(artDate > "2018-11-24")
food_data_word_count_before <-
data_before_voted %>%
filter(!word %in% c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")) %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum > 10) %>%
arrange(desc(sum))
food_data_word_count_after <-
data_after_voted %>%
filter(!word %in% c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")) %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum > 10) %>%
arrange(desc(sum))
food_data_word_count_before %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(15, wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text = element_text(size = 14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
food_data_word_count_after %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(15, wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text = element_text(size = 14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
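The before/after chunks above repeat the same pipeline. A sketch of a combined version that tags each row with a period column and draws both periods in one faceted plot (stop_words and the period labels are illustrative names, not part of the original analysis):
# Shared stop-word list used throughout this analysis.
stop_words <- c("食品", "台灣", "表示", "有沒有", "八卦", "臺灣", "現在", "不是")
food_data %>%
  mutate(period = ifelse(artDate <= as.Date("2018-11-24"), "before_vote", "after_vote")) %>%
  filter(!word %in% stop_words) %>%
  group_by(period, word) %>%
  summarise(sum = sum(count)) %>%
  inner_join(LIWC_ch, by = "word") %>%
  group_by(period, sentiment) %>%
  top_n(15, wt = sum) %>%
  ungroup() %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(period ~ sentiment, scales = "free_y", ncol = 2) +
  labs(y = "Contribution to sentiment", x = NULL) +
  theme(text = element_text(family = "Heiti TC Light", size = 14)) +
  coord_flip()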