Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): the OS
## reports that the request to set the locale to "zh_TW.UTF-8" cannot be honored
## [1] ""
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.6.3
library(data.table)
## Warning: package 'data.table' was built under R version 3.6.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.6.3
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.6.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.6.3
require(widyr)
## Loading required package: widyr
require(NLP)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(ggraph)
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 3.6.3
require(igraph)
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 3.6.3
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
The Gossiping board
data_gossip = fread('./gossip_article.csv',encoding = 'UTF-8')
Filter out special characters
data_gossip = data_gossip %>%
filter(!grepl('_',word))
Convert the date format
data_gossip$artDate = data_gossip$artDate %>% as.Date("%Y/%m/%d")
Unify the tokens that refer to UberEats
UberEats = c("ubereats","ubereat","Ubereat","UBEREATS","UberEATS","uberEATS")
data_gossip$word[which(data_gossip$word %in% UberEats)] = "UberEats"
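If new spellings keep appearing, a case-insensitive regex is a more robust alternative to listing every variant by hand; a minimal sketch with the same effect as the lookup vector above:
# normalize any upper/lower-case variant of "ubereat(s)" in one pass
data_gossip$word[grepl("^ubereats?$", data_gossip$word, ignore.case = TRUE)] <- "UberEats"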
Find the URLs of articles that mention UberEats
ubereats_url = data_gossip$artUrl[grepl("UberEats", data_gossip$word)]
Use those URLs to extract the UberEats articles
data_ubereats <- data_gossip %>%
filter(artUrl %in% ubereats_url)
Remove stop words
jieba_tokenizer = worker() # initialize the jieba segmenter
stop_words <- c("https", "com", "新聞", "完整", "沒有","有沒有","現在","八卦","jpg","imgur","news","http","www","udn","gif")
data_ubereats <- data_ubereats %>%
filter(!(word %in% stop_words))
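An equivalent tidytext-style way to drop stop words, sketched here, is anti_join against a one-column stop-word table:
stop_df <- data.frame(word = stop_words, stringsAsFactors = FALSE)
data_ubereats %>% anti_join(stop_df, by = "word") # keeps rows whose word is NOT a stop word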
Daily discussion volume
ubereats_day <- data_ubereats %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
day_plot <- ubereats_day %>%
ggplot(aes(x = artDate, y = count)) +
geom_line(color = "blue", size = 1) +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("Daily UberEats discussion volume") +
xlab("Date") +
ylab("Count")
day_plot
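Note that count = n() above counts token rows, not articles. A sketch that counts distinct articles per day instead, assuming artUrl uniquely identifies an article:
ubereats_day_articles <- data_ubereats %>%
  group_by(artDate) %>%
  summarise(n_articles = n_distinct(artUrl)) %>% # one row per article URL
  arrange(desc(n_articles))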
The five busiest days
ubereats_day %>%
top_n(5)
## Selecting by count
## # A tibble: 5 x 2
## artDate count
## <date> <int>
## 1 2019-10-14 3077
## 2 2019-10-15 1715
## 3 2019-10-13 1498
## 4 2019-10-24 824
## 5 2019-10-18 677
The most frequent words on the five busiest days
plot_top5 <- data_ubereats %>%
filter(artDate %in% as.Date(c("2019/10/13", "2019/10/14", "2019/10/15", "2019/10/18", "2019/10/24"))) %>%
group_by(artDate) %>%
top_n(7, count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(x=word, y=count, fill = artDate)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL) +
facet_wrap(~artDate, scales="free", ncol = 2) +
coord_flip()
plot_top5
Article titles on 10/14
data_ubereats %>%
filter(artDate == as.Date('2019/10/14')) %>%
distinct(artUrl, .keep_all = TRUE) %>%
select(artTitle)
## artTitle
## 1 [問卦]用電動自行車來外送可行嗎?
## 2 [新聞]Uber併智利生鮮外送商
## 3 [新聞]外送員連三起傷亡事故交通部長林佳龍說
## 4 [爆卦]勞動部認定熊貓跟ubereats是僱傭契約
## 5 [新聞]勞動部認定Foodpanda、UberEats與外送
## 6 [新聞]外送平台讓生意爆紅?業者親揭殘酷真相「
## 7 [新聞]外送車禍又1人!女送餐員和小貨車相撞
## 8 [新聞]北市勞動局︰UberEats違7項職安法規、Fo
## 9 [新聞]外送員車禍亡北市勞檢業者涉違反7項職安
## 10 [新聞]又一起美食外送員車禍女騎士手腳擦傷
## 11 [新聞]外送員屬僱傭關係健保署也加入戰場:得
## 12 [新聞]勞部研擬外送員強制納保意外險
## 13 [新聞]5天3死!外送員撞死違規過馬路老翁
## 14 [問卦]慣老闆沒有奴工用開始搞外送業的卦?
## 15 [新聞]勞檢認定foodpanda、UberEats假承攬真
## 16 [新聞]又一人「送」命!Lalamove外送員撞死老翁
## 17 [新聞]美食外送為僱傭律師:用牛刀管理將打擊
## 18 [問卦]為何ubereat不能像uber一樣高素質
## 19 [新聞]外送員列入雇傭關係後月入10萬夢碎?
## 20 Re:[新聞]勞檢認定foodpanda、UberEats假承攬真
## 21 Re:[新聞]不甩勞動部!foodpanda:不接受僱傭關係
## 22 [新聞]外送爭議蘇貞昌:勞檢不再憂讒畏譏
## 23 [新聞]外送事故5天3死!Lalamove外送員撞死違規
## 24 Re:[新聞]不甩勞動部!foodpanda:不接受僱傭關係
## 25 [新聞]【血汗外送】狠打臉熊貓! 勞檢驚揭業者兩點內規認證「無良
## 26 [問卦]外送三雄大家偏好哪個????
## 27 [新聞]外送員5天3死公路總局開罰foodpanda9千
## 28 Re:[新聞]北市勞工局開罰UberEats認定罹災黃姓
Word frequencies in the 10/14 articles
word_count_uber_1014 <- data_ubereats %>%
filter(artDate == as.Date("2019/10/14")) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count > 10) %>%
arrange(desc(count))
word_count_uber_1014 %>% wordcloud2()
Total word frequency across the whole corpus
word_count_uber <- data_ubereats %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count > 50) %>% # drop words that appear too rarely
arrange(desc(count))
word_count_uber %>% wordcloud2()
# positive-word dictionary (txt file)
# words separated by commas
P <- read_file("./positive.txt")
# negative-word dictionary (txt file)
N <- read_file("./negative.txt")
# split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build a data frame with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
# sentiment analysis with the LIWC dictionary
sentiment_count = data_ubereats %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
arrange(desc(count))
## # A tibble: 180 x 3
## # Groups: artDate [104]
## artDate sentiment count
## <date> <fct> <int>
## 1 2019-10-14 negative 95
## 2 2019-10-15 negative 66
## 3 2019-10-14 positive 61
## 4 2019-10-15 positive 60
## 5 2019-10-13 negative 52
## 6 2019-10-24 positive 47
## 7 2019-10-13 positive 30
## 8 2019-10-18 negative 27
## 9 2019-10-26 negative 23
## 10 2019-10-18 positive 21
## # ... with 170 more rows
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(xintercept = as.numeric(as.Date("2019/10/14")), colour = "red")
# geom_vline draws a vertical line; xintercept is the x position, here the date of the discussion peak converted to a number
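A possible follow-up, sketched here: spread the counts into positive/negative columns and plot the daily net sentiment (positive minus negative), assuming both labels occur in the data:
sentiment_net <- sentiment_count %>%
  spread(sentiment, count, fill = 0) %>% # one column per sentiment label
  mutate(net = positive - negative)
sentiment_net %>%
  ggplot(aes(x = artDate, y = net)) +
  geom_line() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_x_date(labels = date_format("%m/%d"))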
# compute each word's tf-idf, treating each article as one document
ubereat_word <- data_ubereats %>%
select(artUrl, word, count)
ubereat_tf_idf <- ubereat_word %>%
bind_tf_idf(word, artUrl, count)
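For reference, bind_tf_idf computes tf as the word's share of its article's tokens and idf as ln(total articles / articles containing the word). A manual sketch of the same quantities:
n_docs <- n_distinct(ubereat_word$artUrl) # total number of articles
ubereat_tf_idf_manual <- ubereat_word %>%
  group_by(artUrl) %>%
  mutate(tf = count / sum(count)) %>% # term frequency within the article
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(artUrl))) %>% # inverse document frequency
  ungroup() %>%
  mutate(tf_idf = tf * idf)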
# for each article, pick the ten words with the highest tf-idf
ubereat_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl))
## Selecting by tf_idf
## # A tibble: 3,227 x 6
## # Groups: artUrl [269]
## artUrl word count tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 訂單 4 0.121 2.23 0.270
## 2 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 取消 4 0.121 2.89 0.350
## 3 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 馬上 2 0.0606 3.40 0.206
## 4 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 重新 2 0.0606 4.90 0.297
## 5 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 送出去~ 1 0.0303 5.59 0.170
## 6 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 錯誤 1 0.0303 5.59 0.170
## 7 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 結帳 1 0.0303 5.59 0.170
## 8 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 扣款 1 0.0303 5.59 0.170
## 9 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 打去 1 0.0303 4.90 0.149
## 10 https://www.ptt.cc/bbs/Gossiping/M.158746226~ 開完 1 0.0303 5.59 0.170
## # ... with 3,217 more rows
# pick each article's ten highest-tf-idf words,
# then count how often each word is picked
ubereat_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 2,706 x 2
## word n
## <chr> <int>
## 1 業者 15
## 2 店家 10
## 3 勞動部 10
## 4 關係 10
## 5 PO 8
## 6 僱傭 8
## 7 承攬 7
## 8 雇傭 7
## 9 foodpanda 6
## 10 外送員 6
## # ... with 2,696 more rows
Paste each article's tokens back into a single string (collapse = "" so jieba can re-segment the Chinese text)
ubereats_sentence = data_ubereats %>%
group_by(artUrl) %>%
summarise(sentence = paste0(word, collapse = ""))
# re-initialize the jieba segmenter
jieba_tokenizer = worker()
# bigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer) # segment the text with jieba
bigram <- ngrams(tokens, 2) # all pairs of adjacent tokens
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
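A quick sanity check of the tokenizer on a toy string (the exact segmentation depends on the jieba dictionary):
jieba_bigram(c("外送平台的外送員"))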
Bigram tokenization
ubereat_bigram <- ubereats_sentence %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
ubereat_bigram
## # A tibble: 21,915 x 2
## artUrl bigram
## <chr> <chr>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 店家 外送員
## 2 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 外送員 排店
## 3 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 排店 編號
## 4 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 編號 店員
## 5 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 店員 受害人
## 6 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 受害人 羅先生
## 7 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 羅先生 東森
## 8 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 東森 總公司
## 9 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 總公司 取消
## 10 https://www.ptt.cc/bbs/Gossiping/M.1567407974.A.E30.html 取消 這張
## # ... with 21,905 more rows
ubereat_bigram %>%
count(bigram, sort = TRUE)
## # A tibble: 20,245 x 2
## bigram n
## <chr> <int>
## 1 連結 網址 75
## 2 媒體 來源 72
## 3 外送 平台 67
## 4 記者 署名 33
## 5 報導 新聞標題 18
## 6 台北 報導 17
## 7 美食 外送 14
## 8 署名 新聞標題 13
## 9 網址 story 13
## 10 熊貓 ubereats 12
## # ... with 20,235 more rows
# count how often two words appear in the same article
word_pairs <- ubereat_word %>%
pairwise_count(word, artUrl, sort = TRUE)
# compute the correlation (phi coefficient) between word pairs
word_cors <- ubereat_word %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
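pairwise_cor returns the phi coefficient, i.e. the Pearson correlation between two words' binary presence vectors across articles; the value for a given pair does not depend on the n() >= 20 filter. A manual sketch for one pair (both words appear in the results below):
docs <- unique(ubereat_word$artUrl)
x <- docs %in% ubereat_word$artUrl[ubereat_word$word == "UberEats"]
y <- docs %in% ubereat_word$artUrl[ubereat_word$word == "熊貓"]
cor(x, y) # should match the pairwise_cor value for this pair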
# words most correlated with UberEats
word_cors %>%
filter(item1 == "UberEats") %>%
head(10)
## # A tibble: 10 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 UberEats 熊貓 0.127
## 2 UberEats 是不是 0.111
## 3 UberEats foodpanda 0.0955
## 4 UberEats 這麼 0.0911
## 5 UberEats 網友 0.0890
## 6 UberEats 客人 0.0847
## 7 UberEats 剛剛 0.0847
## 8 UberEats 遇到 0.0847
## 9 UberEats 直接 0.0760
## 10 UberEats 不是 0.0760
# pick a few words as seed words
seed_words <- c("udn","網址","許多","www")
# set the threshold to 0.5
threshold <- 0.5
# words whose correlation with a seed word exceeds the threshold join the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
## [1] "網址" "新聞標題" "來源" "連結" "署名" "備註"
## [7] "媒體" "內文" "記者" "表示"
# drop every pair that contains one of these words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
Before the COVID-19 outbreak
ubereats_before <- data_ubereats %>%
filter(artDate < as.Date("2020/01/21"))
Daily discussion volume
ub_before_num <- ubereats_before %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
ub_before_num
## # A tibble: 90 x 2
## artDate count
## <date> <int>
## 1 2019-10-14 3077
## 2 2019-10-15 1715
## 3 2019-10-13 1498
## 4 2019-10-24 824
## 5 2019-10-18 677
## 6 2019-10-16 615
## 7 2019-09-26 467
## 8 2019-10-23 464
## 9 2019-10-17 409
## 10 2019-09-10 388
## # ... with 80 more rows
Count each day's positive and negative words
sentiment_count = ubereats_before %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
arrange(desc(count))
## # A tibble: 136 x 3
## # Groups: artDate [77]
## artDate sentiment count
## <date> <fct> <int>
## 1 2019-10-14 negative 95
## 2 2019-10-15 negative 66
## 3 2019-10-14 positive 61
## 4 2019-10-15 positive 60
## 5 2019-10-13 negative 52
## 6 2019-10-24 positive 47
## 7 2019-10-13 positive 30
## 8 2019-10-18 negative 27
## 9 2019-10-26 negative 23
## 10 2019-10-18 positive 21
## # ... with 126 more rows
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
After the COVID-19 outbreak
ubereats_after <- data_ubereats %>%
filter(artDate >= as.Date("2020/01/21"))
Daily discussion volume
ub_after_num <- ubereats_after %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
ub_after_num
## # A tibble: 34 x 2
## artDate count
## <date> <int>
## 1 2020-03-28 435
## 2 2020-04-18 255
## 3 2020-04-12 224
## 4 2020-04-15 206
## 5 2020-04-10 196
## 6 2020-04-21 170
## 7 2020-01-26 146
## 8 2020-04-17 145
## 9 2020-02-15 126
## 10 2020-04-16 95
## # ... with 24 more rows
Count each day's positive and negative words
sentiment_count = ubereats_after %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
arrange(desc(count))
## # A tibble: 44 x 3
## # Groups: artDate [27]
## artDate sentiment count
## <date> <fct> <int>
## 1 2020-03-28 positive 16
## 2 2020-04-10 positive 14
## 3 2020-04-18 negative 13
## 4 2020-04-12 negative 11
## 5 2020-04-21 negative 11
## 6 2020-03-28 negative 8
## 7 2020-04-17 negative 8
## 8 2020-04-10 negative 7
## 9 2020-04-17 positive 7
## 10 2020-04-15 negative 6
## # ... with 34 more rows
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(xintercept = as.numeric(as.Date("2020/03/28")), colour = "red")
# geom_vline marks the 2020/03/28 spike
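To compare the two periods directly, one possible sketch aggregates the positive/negative word shares before and after the 2020/01/21 cutoff:
data_ubereats %>%
  inner_join(LIWC, by = "word") %>%
  mutate(period = ifelse(artDate < as.Date("2020/01/21"), "before", "after")) %>%
  group_by(period, sentiment) %>%
  summarise(count = sum(count)) %>%
  mutate(share = count / sum(count)) # share within each period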
Article titles on 03/28
data_ubereats %>%
filter(artDate == as.Date('2020/03/28')) %>%
distinct(artUrl, .keep_all = TRUE) %>%
select(artTitle)
## artTitle
## 1 [新聞]今年前二月機車肇事增加外送平台佔4.25%
## 2 [新聞]寧夏夜市靠外送突圍下一步要走向社交電
Comparison cloud of positive and negative words (reshape2 is already attached above)
data_ubereats %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("royalblue1", "royalblue4"), max.words = 100)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector