Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(wordcloud2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
##資料來源:PTT八卦版,關鍵字「口罩2.0」
csv = fread('mask_articleMetaData.csv',encoding = 'UTF-8')
csv$artDate= csv$artDate %>% as.Date("%Y/%m/%d")
head(csv)
## artTitle
## 1: [新聞]政院研擬「口罩實名制2.0」最快下周四上
## 2: [問卦]口罩2.0的掛?
## 3: [新聞]口罩實名制2.0將試辦網路販售最快12日上
## 4: [新聞]行政院擬「口罩實名制2.0」試辦網路販售
## 5: [新聞]口罩2.0拍板:網路購買、超商取貨 今做壓力測試...物流系統變更還需
## 6: [新聞]口罩實名制2.0可網購口罩指揮中心:運費
## artDate artTime artUrl
## 1: 2020-03-06 09:11:39 https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html
## 2: 2020-03-07 06:56:49 https://www.ptt.cc/bbs/Gossiping/M.1583564211.A.6CE.html
## 3: 2020-03-08 03:31:24 https://www.ptt.cc/bbs/Gossiping/M.1583638286.A.D34.html
## 4: 2020-03-08 04:46:49 https://www.ptt.cc/bbs/Gossiping/M.1583642811.A.BA2.html
## 5: 2020-03-09 05:22:52 https://www.ptt.cc/bbs/Gossiping/M.1583731374.A.16A.html
## 6: 2020-03-09 08:34:58 https://www.ptt.cc/bbs/Gossiping/M.1583742901.A.146.html
## artPoster artCat commentNum push boo
## 1: rexlin Gossiping 619 362 31
## 2: kbt2720 Gossiping 53 7 5
## 3: currykukuo Gossiping 183 94 3
## 4: Moogle Gossiping 178 95 11
## 5: blueadam193 Gossiping 911 443 52
## 6: sukiyasuica Gossiping 307 132 22
## sentence
## 1: 1.媒體來源:\n\nUDN\n\n2.記者署名\n\n陳熙文\n\n3.完整新聞標題:\n\n政院研擬「口罩實名制2.0」 最快下周四上路\n\n4.完整新聞內文:\n\n新冠肺炎疫情延燒,導致口罩供需成為焦點。我國適時控管流量,施行「實名制」購買,\n受到外界讚賞。針對進一步沿革,行政院副院長陳其邁今表示,已由政委唐鳳和其團隊在\n努力分析健保資料庫大數據,將於周一整理要點。據了解,若討論出結果,最快下周四可\n以上路。\n\n陳其邁昨於臉書上表示,行政院長蘇貞昌,與各部會5日在院會針對新冠肺炎議題討論超\n前部署,他也和唐鳳討論「口罩實名制2.0」政策,思考口罩實名制再精進的方法,同時\n也與衛生部長陳時中討論強化基層診所、藥師投入防疫的能量。\n\n對於具體做法,陳其邁表示,實名制的口罩發放,除了結合我國健保的資料庫,還有各健\n保藥局的分布,這段時間已累積一些資料,例如:哪一些藥局發放比較多、哪一些藥局發\n放比較少、哪一些是屬於熱點等。對於相關大數據資訊的分析,包括人口分布、熱點分布\n,行政院在思考,如何能夠讓民眾以更方便的方式獲取口罩。\n\n據了解,除了增加實體通路,也考慮虛擬通路等,但具體細節要等周一討論後才有結果。\n陳其邁指出,現由唐鳳和其團隊在進行分析,希望周一會有好消息。據悉,由於現行口罩\n實名制制度每周四滾動式檢討,若研擬新辦法,最快也要等到3月12日才會上路。\n\n5.完整新聞連結 (或短網址):\nhttps://udn.com/news/story/120958/4394302\n6.備註:\n\n可以開放網購嗎?\n
## 2: 大家覺得口罩2.0是什麼??\n\n怎麼感覺就是賣的好的藥局塞更多口罩?\n\n600片變成1000片或2000片之類的\n\n然後藥局更累這樣\n民眾歡呼\n\n\n\n根本不對這國家抱有希望= =\n
## 3: 1.媒體來源:\n\n中央\n\n2.記者署名\n\n顧荃\n\n3.完整新聞標題:\n口罩實名制2.0將試辦網路販售 最快12日上路\n4.完整新聞內文:\n\n(中央社記者顧荃台北8日電)\n行政院研議「口罩實名制2.0」,將試辦網路販售,民眾不\n用再到藥局排隊買口罩。\n細節將待明天行政院長蘇貞昌拍板後對外宣布,最快可望12日上\n路。\n\n因應武漢肺炎,2月6日起實施購買口罩實名制,民眾買口罩須持健保卡到健保特約藥局購\n買,成人口罩7天內限購3片,兒童口罩7天內5片。\n\n由於口罩產能提升,行政院副院長陳其邁日前表示,目前政委唐鳳及同仁正加緊努力,分\n析健保資料庫大數據,思考如何讓民眾更便利取得口罩。\n\n政院官員說,3月4日起政院已啟動「口罩實名制2.0」相關方案的討論,將根據過去的銷\n售紀錄、健保資料庫中累積的大數據,對於出現較多存貨的藥局,在送貨上調降或暫緩,\n讓多出來的口罩來做更有效率的出售。\n\n政院官員表示,因此\n這些多出來的口罩,將試辦以網路販售,讓民眾買口罩更便利。至於\n網路販售的方式與細節,將待明天蘇貞昌拍板定案後,再對外宣布,最快12日上路。\n(編\n輯:林興盟)1090308\n\n5.完整新聞連結 (或短網址):\nhttps://www.cna.com.tw/news/firstnews/202003080046.aspx\n6.備註:
## 4: 1.媒體來源:\n\nEttoday\n\n2.記者署名\n\n陶本和\n\n3.完整新聞標題:\n\n行政院擬「口罩實名制2.0」試辦網路販售 最快12日上路\n\n4.完整新聞內文:\n\n行政院正在研擬「口罩實名制2.0」,屆時將試辦網路販售。對此,行政院長蘇貞昌9日將\n討論細節,待拍板後對外宣布,最快可望於12日上路。\n由於源自中國武漢的新冠肺炎疫情持續延燒,2月6日起實施購買口罩實名制,民眾買口罩\n須持健保卡到健保特約藥局購買,成人口罩7天內限購3片,兒童口罩7天內5片。\n\n儘管近期口罩產量提升,但仍在研擬如何讓民眾更能購買到口罩,且不需到藥局排隊。對\n此,政院官員指出,3月4日起政院已啟動「口罩實名制2.0」相關方案的討論。\n據指出,屆時會根據過去的銷售紀錄、健保資料庫中累積的大數據,對於出現較多存貨的\n藥局,在送貨上調降或暫緩,讓多出來的口罩來做更有效率的出售。\n\n行政院官員表示,這些多出來的口罩,將試辦以網路販售,讓民眾買口罩更便利。至於網\n路販售的方式與細節,將待明天蘇貞昌拍板定案後,再對外宣布,且為了避免防疫期間資\n訊混亂,將統一由中央疫情指揮中心宣布,時程上9日公布分案,最快12日上路。\n\n5.完整新聞連結 (或短網址):\nhttps://www.ettoday.net/news/20200308/1662525.htm?from=fb_et_news\n6.備註:\n快快快!\n我們一起來看看\n率先收割!
## 5: 1.媒體來源:\n\n\nETtoday新聞雲\n\n\n2.記者署名\n\n\n記者陶本和/台北報導\n\n\n3.完整新聞標題:\n\n\n口罩2.0拍板:網路購買、超商取貨 今做壓力測試...物流系統變更還需1週\n\n\n4.完整新聞內文:\n\n\n口罩實名制確定進化到2.0版,行政院長蘇貞昌9日中午召集會議拍板,將推動網路購買、\n\n超商取貨,系統初步測試可行,超商也願配合,今天將會做一整天的壓力測試,由於超商\n\n物流系統變更需一周,預計本週末前展開登記測試,下下週前開始領口罩。\n\n\n據了解,蘇貞昌中午召集會議,敲定推動口罩實名制2.0版,推動「網路購買、超商取貨\n\n」,此方法除了減輕藥局的負擔,也能讓一般上班族有辦法透過系統購買。\n\n\n據了解,目前系統初步測試可行,超商也願意配合,今天將進行一整天的壓力測試,而超\n\n商物流系統變更大概需要一週時間,因此希望能夠在本週末前展開登記測試,下下週前開\n\n始領口罩。\n\n\n5.完整新聞連結 (或短網址):\nhttps://www.ettoday.net/news/20200309/1663197.htm\n6.備註:\n
## 6: 1.媒體來源:自由\n\n\n2.記者署名:林惠琴\n\n\n3.完整新聞標題:口罩實名制2.0可網購口罩 指揮中心:運費不會比口罩還貴\n\n\n4.完整新聞內文:\n〔記者林惠琴/台北報導〕「口罩實名制2.0」即將上路,中央流行疫情指揮中心指揮官\n陳時中今日表示,原本統計口罩以5歲至15歲、65歲以上購買較多,因應其他族群需求而\n增加網路購買管道,但運費目前討論須自付,強調基本上已有方向,但尚未確定,待今日\n進行測試後,預計明日報告細節。\n\n指揮中心醫療整備官羅一鈞說明,上週四實名制口罩增加為成人3片、兒童5片,實施頭2\n日的買氣增加,銷售率平均超過100%,顯示民眾對於口罩需求仍高。\n\n羅一鈞也指出,針對未來口罩實名制2.0的網路購買方案,可購買的入口網站不會只有1個\n,可取貨的超商也不會只有1家;至於運費,民眾可能會擔心買幾片口罩就要花50、60元\n的運費不划算,對此,他強調網購口罩的運費不會比口罩的費用貴。\n\n以現在口罩每片5元,成人一次買3片15元、兒童一次買5片25元來說,運費應該不會高於\n相關費用,但具體的收費內容,羅一鈞指出,仍待與相關單位討論後而定。\n\n\n5.完整新聞連結 (或短網址):\nhttps://reurl.cc/xZDqe1\n6.備註:
data <- csv %>%
select(artDate, artUrl) %>%
distinct()
article_count_by_date <- data %>%
group_by(artDate) %>%
summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 14 x 2
## artDate count
## <date> <int>
## 1 2020-03-06 1
## 2 2020-03-07 1
## 3 2020-03-08 2
## 4 2020-03-09 3
## 5 2020-03-10 5
## 6 2020-03-11 7
## 7 2020-03-12 4
## 8 2020-03-13 1
## 9 2020-03-15 3
## 10 2020-03-16 2
## 11 2020-03-17 1
## 12 2020-03-18 2
## 13 2020-03-19 1
## 14 2020-03-20 1
plot_date <-
# data
article_count_by_date %>%
# aesthetics
ggplot(aes(x = artDate, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 2) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-11")), col='red') +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("武漢肺炎 討論文章數") +
xlab("日期") +
ylab("數量") +
# theme
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
plot_date
# 初始化斷詞引擎
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
new_user_word(jieba_tokenizer, c("李來希", "武漢肺炎", "新冠肺炎", "口罩2.0", "口罩實名制2.0"))
## [1] TRUE
# 自定義斷詞函式
chi_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
mask_tokens <- csv %>% unnest_tokens(word, sentence, token=chi_tokenizer)
str(mask_tokens)
## Classes 'data.table' and 'data.frame': 6210 obs. of 10 variables:
## $ artTitle : chr "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" ...
## $ artDate : Date, format: "2020-03-06" "2020-03-06" ...
## $ artTime : chr "09:11:39" "09:11:39" "09:11:39" "09:11:39" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" ...
## $ artPoster : chr "rexlin" "rexlin" "rexlin" "rexlin" ...
## $ artCat : chr "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
## $ commentNum: int 619 619 619 619 619 619 619 619 619 619 ...
## $ push : int 362 362 362 362 362 362 362 362 362 362 ...
## $ boo : int 31 31 31 31 31 31 31 31 31 31 ...
## $ word : chr "1." "媒體" "來源" "udn" ...
## - attr(*, ".internal.selfref")=<externalptr>
mask_tokens
## artTitle artDate artTime
## 1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## ---
## 6206: [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6207: [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6208: [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6209: [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6210: [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## ---
## 6206: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6207: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6208: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6209: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6210: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## artCat commentNum push boo word
## 1: Gossiping 619 362 31 1.
## 2: Gossiping 619 362 31 媒體
## 3: Gossiping 619 362 31 來源
## 4: Gossiping 619 362 31 udn
## 5: Gossiping 619 362 31 2.
## ---
## 6206: Gossiping 13 7 0 武漢肺炎
## 6207: Gossiping 13 7 0 有沒有
## 6208: Gossiping 13 7 0 買回來
## 6209: Gossiping 13 7 0 紀念
## 6210: Gossiping 13 7 0 的呢
head(mask_tokens, 20)
## artTitle artDate artTime
## 1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 6: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 7: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 8: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 9: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 10: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 11: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 12: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 13: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 14: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 15: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 16: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 17: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 18: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 19: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 20: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 7: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 8: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 9: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 10: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 11: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 12: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 13: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 14: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 15: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 16: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 17: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 18: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 19: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## 20: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html rexlin
## artCat commentNum push boo word
## 1: Gossiping 619 362 31 1.
## 2: Gossiping 619 362 31 媒體
## 3: Gossiping 619 362 31 來源
## 4: Gossiping 619 362 31 udn
## 5: Gossiping 619 362 31 2.
## 6: Gossiping 619 362 31 記者
## 7: Gossiping 619 362 31 署名
## 8: Gossiping 619 362 31 熙文
## 9: Gossiping 619 362 31 3.
## 10: Gossiping 619 362 31 完整
## 11: Gossiping 619 362 31 新聞標題
## 12: Gossiping 619 362 31 政院
## 13: Gossiping 619 362 31 研擬
## 14: Gossiping 619 362 31 口罩實名制2.0
## 15: Gossiping 619 362 31 最快
## 16: Gossiping 619 362 31 周四
## 17: Gossiping 619 362 31 上路
## 18: Gossiping 619 362 31 4.
## 19: Gossiping 619 362 31 完整
## 20: Gossiping 619 362 31 新聞
過濾特殊字元
mask_tokens = mask_tokens %>%
filter(!grepl('_',word))
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
mask_data <- mask_tokens %>%
filter(nchar(.$word)>1) %>%
group_by_all() %>%
summarise(count = n())
# 印出最常見的20個詞彙
head(mask_data, 20)
## # A tibble: 20 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## # push, boo [1]
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <chr> <chr> <chr> <chr> <int> <int> <int>
## 1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 11 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 12 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 13 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 14 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 15 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 16 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 17 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 18 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 19 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 20 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## # … with 2 more variables: word <chr>, count <int>
mask_data
## # A tibble: 4,343 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## # push, boo [34]
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <chr> <chr> <chr> <chr> <int> <int> <int>
## 1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## # … with 4,333 more rows, and 2 more variables: word <chr>, count <int>
word_count <- mask_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
word_count
## # A tibble: 387 x 2
## word count
## <chr> <int>
## 1 口罩 197
## 2 預購 96
## 3 完整 75
## 4 口罩實名制2.0 68
## 5 購買 61
## 6 新聞 55
## 7 民眾 44
## 8 記者 40
## 9 指揮中心 40
## 10 藥局 39
## # … with 377 more rows
全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版
# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(P)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
文集中的字出現在LIWC字典中是屬於positive還是negative
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 17 x 3
## word count sentiment
## <chr> <int> <fct>
## 1 流行 19 positive
## 2 成功 15 positive
## 3 問題 12 negative
## 4 希望 9 positive
## 5 決定 7 positive
## 6 簡單 5 positive
## 7 確定 5 positive
## 8 順利 5 positive
## 9 優惠 5 positive
## 10 自由 5 positive
## 11 改善 4 positive
## 12 隔離 4 negative
## 13 健康 4 positive
## 14 解決 4 positive
## 15 榮譽 4 positive
## 16 效率 4 positive
## 17 壓力 4 negative
mask_data %>%
select(word) %>%
inner_join(LIWC)
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 156 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## # push, boo [32]
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <chr> <chr> <chr> <chr> <int> <int> <int>
## 1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic Gossi… 135 59 4
## # … with 146 more rows, and 2 more variables: word <chr>, sentiment <fct>
#以LIWC情緒字典分析
sentiment_count = mask_data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/11'))
[1]])),colour = "red")
#geom_vline畫出vertical line,xintercept告訴他要在artDate欄位的哪一個row畫線
mask_data %>% filter(artDate == as.Date('2020/03/11')) %>% distinct(artUrl, .keep_all = TRUE)
## # A tibble: 7 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## # push, boo [7]
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <chr> <chr> <chr> <chr> <int> <int> <int>
## 1 [問卦]明天口… 2020-03-11 13:24:… https… LIN9 Gossi… 17 8 1
## 2 [新聞]店員又… 2020-03-11 03:23:… https… hy543 Gossi… 203 69 19
## 3 [新聞]堅持口… 2020-03-11 06:06:… https… yankeeha… Gossi… 73 13 30
## 4 [新聞]口罩實… 2020-03-11 03:28:… https… johnson2… Gossi… 93 13 50
## 5 [新聞]口罩實… 2020-03-11 23:09:… https… zzyyxx77 Gossi… 177 20 95
## 6 [新聞]口罩實… 2020-03-11 01:07:… https… inglee Gossi… 218 25 129
## 7 [新聞]批口罩… 2020-03-11 16:41:… https… tibo96033 Gossi… 154 12 87
## # … with 2 more variables: word <chr>, count <int>
mask_data %>%
filter(artDate == as.Date('2020/03/11')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>10) %>% # 過濾出現太少次的字
wordcloud2()
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
哪篇文章的負面情緒最多?負面情緒的字是?
mask_data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups: artUrl [6]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0「加7元物流費」李來希狂轟… 5
## 2 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘… 4
## 3 https://www.ptt.cc/bbs/Gossipi… negative [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不… 3
## 4 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0多7元李來希:把人民當腦… 3
## 5 https://www.ptt.cc/bbs/Gossipi… negative [新聞]店員又要崩潰...口罩2.0「超商排隊大打… 2
## 6 https://www.ptt.cc/bbs/Gossipi… negative [新聞]批口罩實名制2.0李來希不滿:要先學會上… 1
mask_data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
觀察前後一天的狀況
mask_data %>%
filter(artDate %in% c(as.Date('2020/03/10'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 5 x 4
## # Groups: artUrl [5]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative [爆卦]口罩實名制2.0、新增兩例確診… 8
## 2 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0周四上路線上預購每筆自付7元物流費… 3
## 3 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩2.0要運費 陳玉珍竟嗆:這是小錢沒必要… 2
## 4 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0開會影片曝光 蘇貞昌:我… 1
## 5 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0銀行局:三種付款皆免手… 1
mask_data %>%
filter(artDate %in% c(as.Date('2020/03/12'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 1 x 4
## # Groups: artUrl [1]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M… negative [新聞]口罩2.0上路大當機「比高鐵票還難搶」… 2
mask_data %>%
filter(artDate == as.Date('2020/03/10')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
mask_data %>%
filter(artDate == as.Date('2020/03/12')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
將同一篇的斷詞整理在一起
mask_data_full = mask_data %>%
group_by(artUrl, artDate) %>%
summarise(sentence = paste0(word, collapse = " "))
mask_data_full
## # A tibble: 34 x 3
## # Groups: artUrl [34]
## artUrl artDate sentence
## <chr> <date> <chr>
## 1 https://www.ptt.cc/bbs/Goss… 2020-03-06 1. 12 120958 2. 3. 4. 4394302 5. 6. …
## 2 https://www.ptt.cc/bbs/Goss… 2020-03-07 1000 2000 600 變成 感覺 根本 更累 國家 歡呼 覺得 口…
## 3 https://www.ptt.cc/bbs/Goss… 2020-03-08 1. 1090308 12 2. 202003080046. 3. 4.…
## 4 https://www.ptt.cc/bbs/Goss… 2020-03-08 1. 12 1662525. 2. 20200308 3. 4. 5. …
## 5 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 1663197. 2. 2.0 20200309 3. 4. 5.…
## 6 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 100 15 2. 25 3. 4. 5. 50 6. 60 65…
## 7 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 15 2. 202003095006. 3. 4. 5. 6. 包…
## 8 https://www.ptt.cc/bbs/Goss… 2020-03-10 0800-001922 1. 10 103 12 13 136 14 1…
## 9 https://www.ptt.cc/bbs/Goss… 2020-03-10 1. 12 120958 18 19 2. 233 26 3. 4. 4…
## 10 https://www.ptt.cc/bbs/Goss… 2020-03-10 10 135 15 246 704697 包括 抱怨 報導 本來 比較 …
## # … with 24 more rows
#李來希
mask_comment = fread('Boomer_articleMetaData.csv',encoding = 'UTF-8')
head(mask_comment)
## artTitle artDate artTime
## 1: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 2: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 3: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 4: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 5: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 6: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## artCat commentPoster commentStatus commentDate
## 1: Gossiping BlueBird5566 噓 2020-03-08 14:02:00
## 2: Gossiping kodak2222 噓 2020-03-08 14:03:00
## 3: Gossiping mkopin 噓 2020-03-08 14:03:00
## 4: Gossiping kutkin 推 2020-03-08 14:03:00
## 5: Gossiping hihi29 推 2020-03-08 14:03:00
## 6: Gossiping Sougetu 推 2020-03-08 14:03:00
## commentContent
## 1: :台灣特色的綠色共產主義你懂個屁
## 2: :滾
## 3: :白痴閉嘴
## 4: :重點配給還是不夠用
## 5: :一樓柯糞好了啦
## 6: :推一樓
#調整日期格式
mask_comment$artDate= mask_comment$artDate %>% as.Date("%Y/%m/%d")
head(mask_comment)
## artTitle artDate artTime
## 1: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 2: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 3: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 4: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 5: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 6: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## artCat commentPoster commentStatus commentDate
## 1: Gossiping BlueBird5566 噓 2020-03-08 14:02:00
## 2: Gossiping kodak2222 噓 2020-03-08 14:03:00
## 3: Gossiping mkopin 噓 2020-03-08 14:03:00
## 4: Gossiping kutkin 推 2020-03-08 14:03:00
## 5: Gossiping hihi29 推 2020-03-08 14:03:00
## 6: Gossiping Sougetu 推 2020-03-08 14:03:00
## commentContent
## 1: :台灣特色的綠色共產主義你懂個屁
## 2: :滾
## 3: :白痴閉嘴
## 4: :重點配給還是不夠用
## 5: :一樓柯糞好了啦
## 6: :推一樓
#斷詞
comment_tokens <- mask_comment %>% unnest_tokens(word, commentContent, token=chi_tokenizer)
str(comment_tokens)
## Classes 'data.table' and 'data.frame': 6308 obs. of 10 variables:
## $ artTitle : chr "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" ...
## $ artDate : Date, format: "2020-03-08" "2020-03-08" ...
## $ artTime : chr "06:02:31" "06:02:31" "06:02:31" "06:02:31" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" ...
## $ artPoster : chr "johnson20524" "johnson20524" "johnson20524" "johnson20524" ...
## $ artCat : chr "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
## $ commentPoster: chr "BlueBird5566" "BlueBird5566" "BlueBird5566" "BlueBird5566" ...
## $ commentStatus: chr "噓" "噓" "噓" "噓" ...
## $ commentDate : chr "2020-03-08 14:02:00" "2020-03-08 14:02:00" "2020-03-08 14:02:00" "2020-03-08 14:02:00" ...
## $ word : chr "台灣" "特色" "綠色" "共產主義" ...
## - attr(*, ".internal.selfref")=<externalptr>
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
comment_data <- comment_tokens %>%
filter(nchar(.$word)>1) %>%
group_by_all() %>%
summarise(count = n())
# 印出最常見的20個詞彙
head(comment_data, 20)
## # A tibble: 20 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat,
## # commentPoster, commentStatus, commentDate [8]
## artTitle artDate artTime artUrl artPoster artCat commentPoster
## <chr> <date> <chr> <chr> <chr> <chr> <chr>
## 1 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026
## 2 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026
## 3 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026
## 4 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026
## 5 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… abacada
## 6 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acer758219
## 7 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro
## 8 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro
## 9 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… ADEMAIN
## 10 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… ADEMAIN
## 11 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight
## 12 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight
## 13 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight
## 14 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight
## 15 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight
## 16 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618
## 17 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618
## 18 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618
## 19 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618
## 20 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618
## # … with 4 more variables: commentStatus <chr>, commentDate <chr>, word <chr>,
## # count <int>
comment_count <- comment_data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
comment_data %>%
select(word) %>%
inner_join(LIWC)
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 642 x 11
## # Groups: artTitle, artDate, artTime, artUrl, artPoster, artCat,
## # commentPoster, commentStatus, commentDate [576]
## artTitle artDate artTime artUrl artPoster artCat commentPoster
## <chr> <date> <chr> <chr> <chr> <chr> <chr>
## 1 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acer758219
## 2 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro
## 3 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… angel6502
## 4 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… Ark727
## 5 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… asuka99
## 6 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… banana5566
## 7 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… bardah2c
## 8 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… bengret
## 9 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… carlos5978
## 10 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… chrisjeremy
## # … with 632 more rows, and 4 more variables: commentStatus <chr>,
## # commentDate <chr>, word <chr>, sentiment <fct>
#將同一篇的斷詞整理在一起
comment_data_full = comment_data %>%
group_by(artUrl, artDate) %>%
summarise(sentence = paste0(word, collapse = " "))
# 要
boomer = comment_data_full$artUrl[grepl("李來希", comment_data_full$sentence)]
comment_data %>% filter(artUrl %in% boomer) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 12 x 3
## # Groups: artDate [6]
## artDate sentiment count
## <date> <fct> <int>
## 1 2020-03-08 positive 14
## 2 2020-03-08 negative 31
## 3 2020-03-11 positive 48
## 4 2020-03-11 negative 147
## 5 2020-03-13 positive 56
## 6 2020-03-13 negative 102
## 7 2020-03-15 positive 10
## 8 2020-03-15 negative 22
## 9 2020-03-16 positive 8
## 10 2020-03-16 negative 27
## 11 2020-03-18 positive 39
## 12 2020-03-18 negative 66
comment_data %>% filter(artUrl %in% boomer) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector