資料由中山大學管理學院文字分析平台取得,在平台資料輸出區塊選擇「文章+詞彙+詞頻」選項,即可取得相同格式之csv檔案
本資料為2019/09/01 ~ 2020/04/23 PTT八卦版之資料,透過文字分析平台檢索「外送平台」、「Foodpanda」、「UberEats」等相關關鍵字,共搜尋到2311篇文章。
## [1] ""
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr", "wordcloud")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
require(ggplot2)
require(data.table)
require(scales)
require(wordcloud2)
library(wordcloud)
require(readr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggraph)
require(igraph)
require(reshape2)
require(widyr)
# 載入自平台下載下來的資料
order <- fread("gossip_article.csv", encoding = "UTF-8")
order$artDate = order$artDate %>% as.Date("%Y/%m/%d") # 將日期欄位格式由chr轉為date
# 資料處理
data <- order %>%
select(artDate, artUrl) %>% # 選出文章和日期欄位
distinct() # 一篇文章保留一列
按照日期分群,計算每一天共有幾篇討論文章,可看出討論「外送平台」的熱度
article_count_by_date <- data %>%
group_by(artDate) %>%
summarise(count = n())
article_count_by_date %>%
arrange(desc(count))%>%
top_n(10)
## Selecting by count
## # A tibble: 10 x 2
## artDate count
## <date> <int>
## 1 2019-10-14 203
## 2 2019-10-13 135
## 3 2019-10-15 122
## 4 2019-10-11 90
## 5 2019-10-16 77
## 6 2019-10-02 38
## 7 2019-10-20 36
## 8 2019-10-17 35
## 9 2019-10-18 35
## 10 2019-10-12 34
plot_date <-
# data
article_count_by_date %>%
# aesthetics
ggplot(aes(x = artDate, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2019-10-10")), col='red') +
geom_vline(xintercept = as.numeric(as.Date("2019-10-14")), col='red') +
geom_vline(xintercept = as.numeric(as.Date("2020-01-21")), col='red') +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("外送平台討論文章數") +
xlab("日期") +
ylab("文章數")
plot_date
我們進一步將前期幅度較大的曲線移除,讓曲線走向更明顯
plot_date <-
# data
article_count_by_date %>%
filter(artDate >= "2019-10-31") %>%
# aesthetics
ggplot(aes(x = artDate, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2020-01-15")), col='red') +
geom_vline(xintercept = as.numeric(as.Date("2020-04-16")), col='red') +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("外送平台討論文章數") +
xlab("日期") +
ylab("文章數")
plot_date
從以上兩圖中可以發現:
- 2019年10月10日,討論文章數突然上升
- 2019年10月14日,討論文章數達到最高點(203篇)
- 2020年01月15日,討論文章數為近期高峰
- 2020年01月21日,台灣出現首例新冠病毒確診病例,外送平台的聲量提升幅度並不明顯
- 2020年04月16日,討論文章數再度達到最高點
order %>%
filter(artDate == as.Date('2020/1/15')) %>%
distinct(artUrl, .keep_all = TRUE) %>%
distinct(artTitle, .keep_all = TRUE) %>%
select(artTitle)
## artTitle
## 1 [問卦]ubereats炸雞點了我食量4倍怎麼吃
## 2 [問卦]熊貓明天罷工,大家會怕嗎?
## 3 [問卦]熊貓有必要自相殘殺嗎?
## 4 Re:[問卦]熊貓有必要自相殘殺嗎?
## 5 [問卦]可憐哪!熊貓外送師被砍獎金
## 6 [問卦]熊貓開始罷工了嗎?
## 7 Re:[問卦]欸欸欸!聽說台灣今天熊貓罷工
## 8 [問卦]熊貓外送罷工有人被影響到嗎?
## 9 [問卦]今天中午能定熊貓嗎
## 10 Re:[問卦]今天中午能定熊貓嗎
## 11 [問卦]所以現在熊貓可以點外送嗎
## 12 [新聞]控「年前毀約」!台中60名熊貓外送員市府
## 13 [新聞]熊貓今罷送最多慢1小時…外送員:萬人響應卻無人棄單
## 14 [新聞]foodpanda罷工實測!點餐後30分就拿到
## 15 Re:[新聞]熊貓今罷送最多慢1小時…外送員:萬人響應卻無人棄單
## 16 [新聞]熊貓變更計薪制度內湖外送員控「變相減
## 17 [問卦]社會上為何充滿仇視外送員的八卦?
以同樣方式分析以上時間點,可觀察該時間點討論的議題有:
- 2019年10月10日,[新聞] 熊貓外送員深夜搏命送單!
- 2019年10月14日,[新聞] 不甩勞動部!foodpanda:不接受僱傭關係、外送員交通安全疑慮
- 2020年01月15日,[新聞] Foodpanda罷工
- 2020年04月16日,[新聞] Gogoro騎到飽禁商用「外送員遭罰萬元」
stop_words <- fread("stop_words.csv", encoding = "UTF-8") #設定停用字
order <- order %>%
filter(!word %in% stop_words$word)
#將文章按照文字分群並計算每一個字的總詞頻
word_count <- order %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
head(word_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 外送員 3811
## 2 外送 3135
## 3 熊貓 1360
## 4 外送平台 634
## 5 公司 608
## 6 工作 561
## 7 店家 554
## 8 業者 551
## 9 平台 537
## 10 foodpanda 473
## 11 機車 469
## 12 發生 382
## 13 承攬 376
## 14 車禍 339
## 15 餐點 332
## 16 訂單 313
## 17 美食 309
## 18 服務 289
## 19 送餐 288
## 20 小時 279
依照新冠病毒確診首例日期(1/21)區分資料
data_before <- order %>% filter(artDate <= "2020-01-21")
data_after <- order %>% filter(artDate > "2020-01-21")
data_before <- data_before %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
data_after <- data_after %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
plot_before <- data_before %>% filter(sum > 150) %>% wordcloud2()
plot_before
plot_after <- data_after %>% filter(sum > 50) %>% wordcloud2()
plot_after
與文字雲相比,長條圖可以查看較精確的「最常出現詞彙」。
# 將1/21前後的資料加入一個欄位標示後合併起來
word_count <- rbind(data_before %>% mutate(type="before"), data_after %>% mutate(type="after"))
plot_merge <- word_count %>%
group_by(type) %>%
top_n(15, sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(x=word, y=sum, fill = type)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
facet_wrap(~type, ncol = 1, scales="free") +
coord_flip()
plot_merge
由上圖出現的字詞可以發現討論內容在1/21前後略有不同
接著再進一步將重複的詞彙移除,觀察更明顯的差異
plot_merge <- word_count %>%
group_by(type) %>%
top_n(30, sum) %>%
ungroup() %>%
group_by(word) %>%
filter(n()==1) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = type)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="總和") +
facet_wrap(~type, ncol = 1, scales="free") +
coord_flip()
plot_merge
重複詞彙移除後,觀察1/21前與1/21後的討論差異:
議題由「交通安全」、「外送員屬於承攬還是僱傭」轉變為「居家隔離」、「疫情」
# 正向字典txt檔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
# 將字串依,分割,strsplit回傳 list , 我們取出 list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe:有兩個欄位 word, sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
word_count %>% inner_join(LIWC)
order %>%
select(word) %>%
inner_join(LIWC)
sentiment_before <- order %>% filter(artDate <= "2020-01-21")
sentiment_count = sentiment_before %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019/10/10'))[1]])),colour = "red") +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2019/10/14'))[1]])),colour = "red")
觀察上圖情緒變化來回顧事件內容:負面情緒在10月10日突然大幅上升,在10月14日達到負面情緒最大值
外送員車禍致死新聞
order %>%
filter(artDate == as.Date('2019/10/10)')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## # A tibble: 14 x 4
## # Groups: artUrl [14]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]【外送搏命1】熊貓外送員拚賺宵夜錢桃園~ 5
## 2 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]客控外送員棄單偷吃!本尊親回覆 案情逆~ 4
## 3 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]熊貓外送員深夜搏命送單!擦撞貨車噴飛~ 4
## 4 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]熊貓外送員深夜搏命送單!擦撞貨車噴飛~ 3
## 5 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]女熊貓外送上樓! 男顧客開門吐「3字」~ 3
## 6 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]熊貓外送員深夜搏命送單!擦撞貨車噴飛~ 2
## 7 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]【外送搏命1】熊貓外送員拚賺宵夜錢桃園~ 2
## 8 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]外送員死了叫外送的心安嗎?~ 2
## 9 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]住處不送Foodpanda什麼感覺?~ 1
## 10 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]現在風向怎麼那麼仇視外送員?~ 1
## 11 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]熊貓外送員深夜搏命送單!擦撞貨車噴飛~ 1
## 12 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]外送平台會不會導致臺灣亡國?~ 1
## 13 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]【外送搏命1】熊貓外送員拚賺宵夜錢~ 1
## 14 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]台女還是台男叫外送的多啊?~ 1
# 10月10日文字雲
order %>%
filter(artDate == as.Date('2019/10/10')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
wordcloud2()
查詢10月10日當日文章「正面」與「負面」字詞的使用頻率
order %>%
filter(artDate == as.Date('2019/10/10')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
order %>%
filter(artDate %in% c(as.Date('2019/10/09'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## # A tibble: 1 x 4
## # Groups: artUrl [1]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.15~ negative [問卦]半夜兩點後外送軟體變廢物?~ 2
order %>%
filter(artDate %in% c(as.Date('2019/10/11'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## # A tibble: 57 x 4
## # Groups: artUrl [57]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫~ 12
## 2 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫~ 12
## 3 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫~ 11
## 4 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]外送員車禍foodpanda回應了~ 9
## 5 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]484該管外送員了 6
## 6 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]熊貓外送員車禍身亡時力參選人質疑平台~ 6
## 7 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]作弄外送妹子好好玩XDDD~ 5
## 8 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫~ 5
## 9 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫~ 5
## 10 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]外送員車禍foodpanda回應了~ 4
## # ... with 47 more rows
經以上結果發現:
- 10月9日僅有一篇負面文章;
- 10月11日外送員車禍致死的隔天,開始出現大量文章討論車禍事件與外送員的交通安全疑慮
進一步觀察10月10日前後一天文章所使用的正面、負面字詞
order %>%
filter(artDate == as.Date('2019/10/09')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
order %>%
filter(artDate == as.Date('2019/10/11')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
可觀察在10號車禍事件發生之後,「違規」、「風險」、「危險」等負面字詞開始出現。
「事故」、「遺憾」、「傷亡」是10號與前後一天不同的字詞,可得知當日聚焦在討論車禍事件上。
accident_alias = c("事故","意外", "車禍", "致死")
accident_plot = order %>%
filter(nchar(.$word)>1) %>%
filter(word %in% accident_alias) %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot(aes(x = artDate, y=count)) +
geom_col() +
ggtitle("與交通事故有關的字詞出現時機點與次數") +
xlab("時間") +
ylab("次數")
accident_plot
將前期幅度較大的曲線移除,讓曲線走向更明顯
sentiment_before2 <- order %>%
filter(artDate <= "2020-01-21") %>%
filter(artDate >= "2019-10-31")
sentiment_count2 = sentiment_before2 %>%
select(artDate, word, count) %>%
inner_join(LIWC) %>%
group_by(artDate, sentiment) %>%
summarise(count=sum(count))
sentiment_count2 %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d")) +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count2$artDate == as.Date('2020-01-15'))[1]])),colour = "red")
承第I章提到的議題時間點:1月15日外送員罷工當天文章的「負面」情緒達到高峰
查詢1月15日當日文章「正面」與「負面」字詞的使用頻率
order %>%
filter(artDate == as.Date('2020/1/15')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
可發現負面字詞中包含「不滿」、「抗爭」等與罷工有關的字詞存在
order_before <- order %>% filter(artDate <= "2020-01-21")
order_before <- order_before %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artTitle, word, sort = TRUE)
total_words <- order_before %>%
group_by(artTitle) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
order_before <- left_join(order_before, total_words)
total_words
## # A tibble: 1,537 x 2
## artTitle total
## <chr> <int>
## 1 Re:[新聞]勞檢認定foodpanda、UberEats假承攬真 1629
## 2 Re:[新聞]不甩勞動部!foodpanda:不接受僱傭關係 1091
## 3 Re:[新聞]外送員車禍foodpanda回應了 970
## 4 Re:[新聞]賣命!熊貓美食外送員遭撞慘死超驚悚畫 867
## 5 Re:[新聞]熊貓外送員「可以不要送上樓嗎」…她傻 820
## 6 Re:[新聞]勞動部認定Foodpanda、UberEats與外送 694
## 7 Re:[問卦]同意讓熊貓抽30%的店家在想什麼? 657
## 8 Re:[爆卦]外送員真的月入十萬呢呵呵呵 607
## 9 Re:[爆卦]勞動部認定熊貓跟ubereats是僱傭契約 556
## 10 Re:[問卦]熊貓出新招了這次連顧客一起承攬 532
## # ... with 1,527 more rows
order_tf_idf <- order_before %>%
bind_tf_idf(word, artTitle, n)
order_tf_idf %>%
filter(total > 20) %>%
arrange(desc(tf_idf))
## # A tibble: 76,057 x 7
## artTitle word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Re:[問卦]外送員車禍死了叫外送的會知道嗎?~ 人會 2 23 0.0870 4.25 0.369
## 2 [問卦]foodpanda免外送餐點縮水 天填 1 21 0.0476 7.34 0.349
## 3 [問卦]foodpanda免外送餐點縮水 王小華堡~ 1 21 0.0476 7.34 0.349
## 4 [問卦]foodpanda免外送餐點縮水 王官 1 21 0.0476 7.34 0.349
## 5 [問卦]foodpanda免外送餐點縮水 兌換券 1 21 0.0476 7.34 0.349
## 6 [問卦]foodpanda免外送餐點縮水 周宜 1 21 0.0476 7.34 0.349
## 7 [問卦]foodpanda免外送餐點縮水 問券 1 21 0.0476 7.34 0.349
## 8 [問卦]foodpanda免外送餐點縮水 紫好 1 21 0.0476 7.34 0.349
## 9 [問卦]foodpanda免外送餐點縮水 號才 1 21 0.0476 7.34 0.349
## 10 [問卦]foodpanda免外送餐點縮水 網小華堡~ 1 21 0.0476 7.34 0.349
## # ... with 76,047 more rows
order_before <- order %>% filter(artDate <= "2020-01-21")
order_before <- order_before %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artDate, word, sort = TRUE)
total_words <- order_before %>%
group_by(artDate) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
order_before <- left_join(order_before, total_words)
order_tf_idf_date <- order_before %>%
bind_tf_idf(word, artDate, n)
order_tf_idf_date %>%
filter(total > 20) %>%
arrange(desc(tf_idf))
## # A tibble: 66,649 x 7
## artDate word n total tf idf tf_idf
## <date> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 2019-12-27 火花 1 25 0.04 4.95 0.198
## 2 2019-12-27 完結 1 25 0.04 4.95 0.198
## 3 2019-12-27 完結篇 1 25 0.04 4.95 0.198
## 4 2019-12-27 起肖 1 25 0.04 4.95 0.198
## 5 2019-12-27 葉問 1 25 0.04 4.95 0.198
## 6 2019-12-27 擦出 1 25 0.04 4.95 0.198
## 7 2019-12-01 ZXKQYHP 1 28 0.0357 4.95 0.177
## 8 2019-12-01 心裏 1 28 0.0357 4.95 0.177
## 9 2019-12-01 樁腳 1 28 0.0357 4.95 0.177
## 10 2019-12-26 把車 1 28 0.0357 4.95 0.177
## # ... with 66,639 more rows
order_tf_idf_date %>%
filter(artDate == as.Date("2019-10-10") | artDate == as.Date("2019-10-14")) %>%
group_by(artDate) %>%
top_n(1) %>%
arrange(artDate)
## # A tibble: 2 x 7
## # Groups: artDate [2]
## artDate word n total tf idf tf_idf
## <date> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 2019-10-10 撞死 5 965 0.00518 2.31 0.0120
## 2 2019-10-14 勞健保 34 10080 0.00337 1.77 0.00597