Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(wordcloud2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(readr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor

##資料來源:PTT八卦版,關鍵字「口罩2.0」

csv = fread('mask_articleMetaData.csv',encoding = 'UTF-8')
csv$artDate= csv$artDate %>% as.Date("%Y/%m/%d")
head(csv)
##                                                                  artTitle
## 1:                            [新聞]政院研擬「口罩實名制2.0」最快下周四上
## 2:                                                    [問卦]口罩2.0的掛?
## 3:                            [新聞]口罩實名制2.0將試辦網路販售最快12日上
## 4:                            [新聞]行政院擬「口罩實名制2.0」試辦網路販售
## 5: [新聞]口罩2.0拍板:網路購買、超商取貨 今做壓力測試...物流系統變更還需
## 6:                            [新聞]口罩實名制2.0可網購口罩指揮中心:運費
##       artDate  artTime                                                   artUrl
## 1: 2020-03-06 09:11:39 https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html
## 2: 2020-03-07 06:56:49 https://www.ptt.cc/bbs/Gossiping/M.1583564211.A.6CE.html
## 3: 2020-03-08 03:31:24 https://www.ptt.cc/bbs/Gossiping/M.1583638286.A.D34.html
## 4: 2020-03-08 04:46:49 https://www.ptt.cc/bbs/Gossiping/M.1583642811.A.BA2.html
## 5: 2020-03-09 05:22:52 https://www.ptt.cc/bbs/Gossiping/M.1583731374.A.16A.html
## 6: 2020-03-09 08:34:58 https://www.ptt.cc/bbs/Gossiping/M.1583742901.A.146.html
##      artPoster    artCat commentNum push boo
## 1:      rexlin Gossiping        619  362  31
## 2:     kbt2720 Gossiping         53    7   5
## 3:  currykukuo Gossiping        183   94   3
## 4:      Moogle Gossiping        178   95  11
## 5: blueadam193 Gossiping        911  443  52
## 6: sukiyasuica Gossiping        307  132  22
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          sentence
## 1: 1.媒體來源:\n\nUDN\n\n2.記者署名\n\n陳熙文\n\n3.完整新聞標題:\n\n政院研擬「口罩實名制2.0」 最快下周四上路\n\n4.完整新聞內文:\n\n新冠肺炎疫情延燒,導致口罩供需成為焦點。我國適時控管流量,施行「實名制」購買,\n受到外界讚賞。針對進一步沿革,行政院副院長陳其邁今表示,已由政委唐鳳和其團隊在\n努力分析健保資料庫大數據,將於周一整理要點。據了解,若討論出結果,最快下周四可\n以上路。\n\n陳其邁昨於臉書上表示,行政院長蘇貞昌,與各部會5日在院會針對新冠肺炎議題討論超\n前部署,他也和唐鳳討論「口罩實名制2.0」政策,思考口罩實名制再精進的方法,同時\n也與衛生部長陳時中討論強化基層診所、藥師投入防疫的能量。\n\n對於具體做法,陳其邁表示,實名制的口罩發放,除了結合我國健保的資料庫,還有各健\n保藥局的分布,這段時間已累積一些資料,例如:哪一些藥局發放比較多、哪一些藥局發\n放比較少、哪一些是屬於熱點等。對於相關大數據資訊的分析,包括人口分布、熱點分布\n,行政院在思考,如何能夠讓民眾以更方便的方式獲取口罩。\n\n據了解,除了增加實體通路,也考慮虛擬通路等,但具體細節要等周一討論後才有結果。\n陳其邁指出,現由唐鳳和其團隊在進行分析,希望周一會有好消息。據悉,由於現行口罩\n實名制制度每周四滾動式檢討,若研擬新辦法,最快也要等到3月12日才會上路。\n\n5.完整新聞連結 (或短網址):\nhttps://udn.com/news/story/120958/4394302\n6.備註:\n\n可以開放網購嗎?\n
## 2:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            大家覺得口罩2.0是什麼??\n\n怎麼感覺就是賣的好的藥局塞更多口罩?\n\n600片變成1000片或2000片之類的\n\n然後藥局更累這樣\n民眾歡呼\n\n\n\n根本不對這國家抱有希望= =\n
## 3:                                                                                                                                                                                                      1.媒體來源:\n\n中央\n\n2.記者署名\n\n顧荃\n\n3.完整新聞標題:\n口罩實名制2.0將試辦網路販售 最快12日上路\n4.完整新聞內文:\n\n(中央社記者顧荃台北8日電)\n行政院研議「口罩實名制2.0」,將試辦網路販售,民眾不\n用再到藥局排隊買口罩。\n細節將待明天行政院長蘇貞昌拍板後對外宣布,最快可望12日上\n路。\n\n因應武漢肺炎,2月6日起實施購買口罩實名制,民眾買口罩須持健保卡到健保特約藥局購\n買,成人口罩7天內限購3片,兒童口罩7天內5片。\n\n由於口罩產能提升,行政院副院長陳其邁日前表示,目前政委唐鳳及同仁正加緊努力,分\n析健保資料庫大數據,思考如何讓民眾更便利取得口罩。\n\n政院官員說,3月4日起政院已啟動「口罩實名制2.0」相關方案的討論,將根據過去的銷\n售紀錄、健保資料庫中累積的大數據,對於出現較多存貨的藥局,在送貨上調降或暫緩,\n讓多出來的口罩來做更有效率的出售。\n\n政院官員表示,因此\n這些多出來的口罩,將試辦以網路販售,讓民眾買口罩更便利。至於\n網路販售的方式與細節,將待明天蘇貞昌拍板定案後,再對外宣布,最快12日上路。\n(編\n輯:林興盟)1090308\n\n5.完整新聞連結 (或短網址):\nhttps://www.cna.com.tw/news/firstnews/202003080046.aspx\n6.備註:
## 4:                                                                                                                                      1.媒體來源:\n\nEttoday\n\n2.記者署名\n\n陶本和\n\n3.完整新聞標題:\n\n行政院擬「口罩實名制2.0」試辦網路販售 最快12日上路\n\n4.完整新聞內文:\n\n行政院正在研擬「口罩實名制2.0」,屆時將試辦網路販售。對此,行政院長蘇貞昌9日將\n討論細節,待拍板後對外宣布,最快可望於12日上路。\n由於源自中國武漢的新冠肺炎疫情持續延燒,2月6日起實施購買口罩實名制,民眾買口罩\n須持健保卡到健保特約藥局購買,成人口罩7天內限購3片,兒童口罩7天內5片。\n\n儘管近期口罩產量提升,但仍在研擬如何讓民眾更能購買到口罩,且不需到藥局排隊。對\n此,政院官員指出,3月4日起政院已啟動「口罩實名制2.0」相關方案的討論。\n據指出,屆時會根據過去的銷售紀錄、健保資料庫中累積的大數據,對於出現較多存貨的\n藥局,在送貨上調降或暫緩,讓多出來的口罩來做更有效率的出售。\n\n行政院官員表示,這些多出來的口罩,將試辦以網路販售,讓民眾買口罩更便利。至於網\n路販售的方式與細節,將待明天蘇貞昌拍板定案後,再對外宣布,且為了避免防疫期間資\n訊混亂,將統一由中央疫情指揮中心宣布,時程上9日公布分案,最快12日上路。\n\n5.完整新聞連結 (或短網址):\nhttps://www.ettoday.net/news/20200308/1662525.htm?from=fb_et_news\n6.備註:\n快快快!\n我們一起來看看\n率先收割!
## 5:                                                                                                                                                                                                                                                                                                                                                                                             1.媒體來源:\n\n\nETtoday新聞雲\n\n\n2.記者署名\n\n\n記者陶本和/台北報導\n\n\n3.完整新聞標題:\n\n\n口罩2.0拍板:網路購買、超商取貨 今做壓力測試...物流系統變更還需1週\n\n\n4.完整新聞內文:\n\n\n口罩實名制確定進化到2.0版,行政院長蘇貞昌9日中午召集會議拍板,將推動網路購買、\n\n超商取貨,系統初步測試可行,超商也願配合,今天將會做一整天的壓力測試,由於超商\n\n物流系統變更需一周,預計本週末前展開登記測試,下下週前開始領口罩。\n\n\n據了解,蘇貞昌中午召集會議,敲定推動口罩實名制2.0版,推動「網路購買、超商取貨\n\n」,此方法除了減輕藥局的負擔,也能讓一般上班族有辦法透過系統購買。\n\n\n據了解,目前系統初步測試可行,超商也願意配合,今天將進行一整天的壓力測試,而超\n\n商物流系統變更大概需要一週時間,因此希望能夠在本週末前展開登記測試,下下週前開\n\n始領口罩。\n\n\n5.完整新聞連結 (或短網址):\nhttps://www.ettoday.net/news/20200309/1663197.htm\n6.備註:\n
## 6:                                                                                                                                                                                                                                                       1.媒體來源:自由\n\n\n2.記者署名:林惠琴\n\n\n3.完整新聞標題:口罩實名制2.0可網購口罩 指揮中心:運費不會比口罩還貴\n\n\n4.完整新聞內文:\n〔記者林惠琴/台北報導〕「口罩實名制2.0」即將上路,中央流行疫情指揮中心指揮官\n陳時中今日表示,原本統計口罩以5歲至15歲、65歲以上購買較多,因應其他族群需求而\n增加網路購買管道,但運費目前討論須自付,強調基本上已有方向,但尚未確定,待今日\n進行測試後,預計明日報告細節。\n\n指揮中心醫療整備官羅一鈞說明,上週四實名制口罩增加為成人3片、兒童5片,實施頭2\n日的買氣增加,銷售率平均超過100%,顯示民眾對於口罩需求仍高。\n\n羅一鈞也指出,針對未來口罩實名制2.0的網路購買方案,可購買的入口網站不會只有1個\n,可取貨的超商也不會只有1家;至於運費,民眾可能會擔心買幾片口罩就要花50、60元\n的運費不划算,對此,他強調網購口罩的運費不會比口罩的費用貴。\n\n以現在口罩每片5元,成人一次買3片15元、兒童一次買5片25元來說,運費應該不會高於\n相關費用,但具體的收費內容,羅一鈞指出,仍待與相關單位討論後而定。\n\n\n5.完整新聞連結 (或短網址):\nhttps://reurl.cc/xZDqe1\n6.備註:
data <- csv %>% 
  select(artDate, artUrl) %>% 
  distinct()
article_count_by_date <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())

head(article_count_by_date, 20)
## # A tibble: 14 x 2
##    artDate    count
##    <date>     <int>
##  1 2020-03-06     1
##  2 2020-03-07     1
##  3 2020-03-08     2
##  4 2020-03-09     3
##  5 2020-03-10     5
##  6 2020-03-11     7
##  7 2020-03-12     4
##  8 2020-03-13     1
##  9 2020-03-15     3
## 10 2020-03-16     2
## 11 2020-03-17     1
## 12 2020-03-18     2
## 13 2020-03-19     1
## 14 2020-03-20     1
plot_date <- 
  # data
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 2) + 
  geom_vline(xintercept = as.numeric(as.Date("2020-03-11")), col='red') + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("武漢肺炎 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。

plot_date

# 初始化斷詞引擎
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
new_user_word(jieba_tokenizer, c("李來希", "武漢肺炎", "新冠肺炎", "口罩2.0", "口罩實名制2.0"))
## [1] TRUE
# 自定義斷詞函式
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
mask_tokens <- csv %>% unnest_tokens(word, sentence, token=chi_tokenizer)
str(mask_tokens)
## Classes 'data.table' and 'data.frame':   6210 obs. of  10 variables:
##  $ artTitle  : chr  "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" "[新聞]政院研擬「口罩實名制2.0」最快下周四上" ...
##  $ artDate   : Date, format: "2020-03-06" "2020-03-06" ...
##  $ artTime   : chr  "09:11:39" "09:11:39" "09:11:39" "09:11:39" ...
##  $ artUrl    : chr  "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" "https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html" ...
##  $ artPoster : chr  "rexlin" "rexlin" "rexlin" "rexlin" ...
##  $ artCat    : chr  "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
##  $ commentNum: int  619 619 619 619 619 619 619 619 619 619 ...
##  $ push      : int  362 362 362 362 362 362 362 362 362 362 ...
##  $ boo       : int  31 31 31 31 31 31 31 31 31 31 ...
##  $ word      : chr  "1." "媒體" "來源" "udn" ...
##  - attr(*, ".internal.selfref")=<externalptr>
mask_tokens
##                                          artTitle    artDate  artTime
##    1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##    2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##    3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##    4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##    5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##   ---                                                                
## 6206:       [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6207:       [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6208:       [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6209:       [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
## 6210:       [問卦]有人用口罩2.0買只是為了紀念嗎? 2020-03-20 00:51:27
##                                                         artUrl  artPoster
##    1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html     rexlin
##    2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html     rexlin
##    3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html     rexlin
##    4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html     rexlin
##    5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html     rexlin
##   ---                                                                    
## 6206: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6207: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6208: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6209: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
## 6210: https://www.ptt.cc/bbs/Gossiping/M.1584665489.A.FB7.html serenemind
##          artCat commentNum push boo     word
##    1: Gossiping        619  362  31       1.
##    2: Gossiping        619  362  31     媒體
##    3: Gossiping        619  362  31     來源
##    4: Gossiping        619  362  31      udn
##    5: Gossiping        619  362  31       2.
##   ---                                       
## 6206: Gossiping         13    7   0 武漢肺炎
## 6207: Gossiping         13    7   0   有沒有
## 6208: Gossiping         13    7   0   買回來
## 6209: Gossiping         13    7   0     紀念
## 6210: Gossiping         13    7   0     的呢
head(mask_tokens, 20)
##                                        artTitle    artDate  artTime
##  1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  6: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  7: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  8: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##  9: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 10: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 11: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 12: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 13: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 14: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 15: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 16: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 17: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 18: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 19: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
## 20: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020-03-06 09:11:39
##                                                       artUrl artPoster
##  1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  6: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  7: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  8: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##  9: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 10: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 11: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 12: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 13: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 14: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 15: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 16: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 17: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 18: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 19: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
## 20: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html    rexlin
##        artCat commentNum push boo          word
##  1: Gossiping        619  362  31            1.
##  2: Gossiping        619  362  31          媒體
##  3: Gossiping        619  362  31          來源
##  4: Gossiping        619  362  31           udn
##  5: Gossiping        619  362  31            2.
##  6: Gossiping        619  362  31          記者
##  7: Gossiping        619  362  31          署名
##  8: Gossiping        619  362  31          熙文
##  9: Gossiping        619  362  31            3.
## 10: Gossiping        619  362  31          完整
## 11: Gossiping        619  362  31      新聞標題
## 12: Gossiping        619  362  31          政院
## 13: Gossiping        619  362  31          研擬
## 14: Gossiping        619  362  31 口罩實名制2.0
## 15: Gossiping        619  362  31          最快
## 16: Gossiping        619  362  31          周四
## 17: Gossiping        619  362  31          上路
## 18: Gossiping        619  362  31            4.
## 19: Gossiping        619  362  31          完整
## 20: Gossiping        619  362  31          新聞

過濾特殊字元

mask_tokens = mask_tokens %>% 
  filter(!grepl('_',word))

文字雲

# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
mask_data <- mask_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by_all() %>% 
  summarise(count = n())

# 印出最常見的20個詞彙
head(mask_data, 20)
## # A tibble: 20 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## #   push, boo [1]
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <chr>   <chr>  <chr>     <chr>       <int> <int> <int>
##  1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 11 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 12 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 13 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 14 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 15 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 16 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 17 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 18 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 19 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 20 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## # … with 2 more variables: word <chr>, count <int>
mask_data
## # A tibble: 4,343 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## #   push, boo [34]
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <chr>   <chr>  <chr>     <chr>       <int> <int> <int>
##  1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## # … with 4,333 more rows, and 2 more variables: word <chr>, count <int>

計算所有字在文集中的總詞頻

word_count <- mask_data %>%
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>3) %>%  # 過濾出現太少次的字
  arrange(desc(count))
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
word_count
## # A tibble: 387 x 2
##    word          count
##    <chr>         <int>
##  1 口罩            197
##  2 預購             96
##  3 完整             75
##  4 口罩實名制2.0    68
##  5 購買             61
##  6 新聞             55
##  7 民眾             44
##  8 記者             40
##  9 指揮中心         40
## 10 藥局             39
## # … with 377 more rows

準備LIWC字典

全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版

以LIWC字典判斷文集中的word屬於正面字還是負面字

# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")

# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(P)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

與LIWC情緒字典做join

文集中的字出現在LIWC字典中是屬於positive還是negative

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 17 x 3
##    word  count sentiment
##    <chr> <int> <fct>    
##  1 流行     19 positive 
##  2 成功     15 positive 
##  3 問題     12 negative 
##  4 希望      9 positive 
##  5 決定      7 positive 
##  6 簡單      5 positive 
##  7 確定      5 positive 
##  8 順利      5 positive 
##  9 優惠      5 positive 
## 10 自由      5 positive 
## 11 改善      4 positive 
## 12 隔離      4 negative 
## 13 健康      4 positive 
## 14 解決      4 positive 
## 15 榮譽      4 positive 
## 16 效率      4 positive 
## 17 壓力      4 negative
mask_data %>% 
  select(word) %>%
  inner_join(LIWC)
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 156 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## #   push, boo [32]
##    artTitle artDate    artTime artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <chr>   <chr>  <chr>     <chr>       <int> <int> <int>
##  1 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  2 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  3 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  4 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  5 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  6 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  7 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  8 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
##  9 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## 10 [爆卦]口罩實… 2020-03-10 05:42:… https… laptic    Gossi…        135    59     4
## # … with 146 more rows, and 2 more variables: word <chr>, sentiment <fct>

#以LIWC情緒字典分析

統計每天的文章正面字的次數與負面字的次數

sentiment_count = mask_data %>%
  select(artDate,word,count) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/11'))
[1]])),colour = "red") 

#geom_vline畫出vertical line,xintercept告訴他要在artDate欄位的哪一個row畫線
mask_data %>% filter(artDate == as.Date('2020/03/11')) %>% distinct(artUrl, .keep_all = TRUE)
## # A tibble: 7 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat, commentNum,
## #   push, boo [7]
##   artTitle artDate    artTime artUrl artPoster artCat commentNum  push   boo
##   <chr>    <date>     <chr>   <chr>  <chr>     <chr>       <int> <int> <int>
## 1 [問卦]明天口… 2020-03-11 13:24:… https… LIN9      Gossi…         17     8     1
## 2 [新聞]店員又… 2020-03-11 03:23:… https… hy543     Gossi…        203    69    19
## 3 [新聞]堅持口… 2020-03-11 06:06:… https… yankeeha… Gossi…         73    13    30
## 4 [新聞]口罩實… 2020-03-11 03:28:… https… johnson2… Gossi…         93    13    50
## 5 [新聞]口罩實… 2020-03-11 23:09:… https… zzyyxx77  Gossi…        177    20    95
## 6 [新聞]口罩實… 2020-03-11 01:07:… https… inglee    Gossi…        218    25   129
## 7 [新聞]批口罩… 2020-03-11 16:41:… https… tibo96033 Gossi…        154    12    87
## # … with 2 more variables: word <chr>, count <int>
mask_data %>% 
  filter(artDate == as.Date('2020/03/11')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>10) %>%   # 過濾出現太少次的字
  wordcloud2()
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentNum`, `push`, `boo`

哪篇文章的負面情緒最多?負面情緒的字是?

mask_data %>% 
  filter(artDate == as.Date('2020/03/11')) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups:   artUrl [6]
##   artUrl                          sentiment artTitle                       count
##   <chr>                           <fct>     <chr>                          <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0「加7元物流費」李來希狂轟…     5
## 2 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘…     4
## 3 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不…     3
## 4 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0多7元李來希:把人民當腦…     3
## 5 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]店員又要崩潰...口罩2.0「超商排隊大打…     2
## 6 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]批口罩實名制2.0李來希不滿:要先學會上…     1
mask_data %>%
  filter(artDate == as.Date('2020/03/11')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14, family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

觀察前後一天的狀況

mask_data %>% 
  filter(artDate %in% c(as.Date('2020/03/10'))) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 5 x 4
## # Groups:   artUrl [5]
##   artUrl                          sentiment artTitle                       count
##   <chr>                           <fct>     <chr>                          <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative  [爆卦]口罩實名制2.0、新增兩例確診…     8
## 2 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0周四上路線上預購每筆自付7元物流費…     3
## 3 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩2.0要運費 陳玉珍竟嗆:這是小錢沒必要…     2
## 4 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0開會影片曝光 蘇貞昌:我…     1
## 5 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]口罩實名制2.0銀行局:三種付款皆免手…     1
mask_data %>% 
  filter(artDate %in% c(as.Date('2020/03/12'))) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 1 x 4
## # Groups:   artUrl [1]
##   artUrl                              sentiment artTitle                   count
##   <chr>                               <fct>     <chr>                      <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M… negative  [新聞]口罩2.0上路大當機「比高鐵票還難搶」…     2
mask_data %>%
  filter(artDate == as.Date('2020/03/10')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14, family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

mask_data %>%
  filter(artDate == as.Date('2020/03/12')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14, family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

將同一篇的斷詞整理在一起

mask_data_full = mask_data %>% 
                group_by(artUrl, artDate) %>% 
                summarise(sentence = paste0(word, collapse = " "))
mask_data_full
## # A tibble: 34 x 3
## # Groups:   artUrl [34]
##    artUrl                       artDate    sentence                             
##    <chr>                        <date>     <chr>                                
##  1 https://www.ptt.cc/bbs/Goss… 2020-03-06 1. 12 120958 2. 3. 4. 4394302 5. 6. …
##  2 https://www.ptt.cc/bbs/Goss… 2020-03-07 1000 2000 600 變成 感覺 根本 更累 國家 歡呼 覺得 口…
##  3 https://www.ptt.cc/bbs/Goss… 2020-03-08 1. 1090308 12 2. 202003080046. 3. 4.…
##  4 https://www.ptt.cc/bbs/Goss… 2020-03-08 1. 12 1662525. 2. 20200308 3. 4. 5. …
##  5 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 1663197. 2. 2.0 20200309 3. 4. 5.…
##  6 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 100 15 2. 25 3. 4. 5. 50 6. 60 65…
##  7 https://www.ptt.cc/bbs/Goss… 2020-03-09 1. 15 2. 202003095006. 3. 4. 5. 6. 包…
##  8 https://www.ptt.cc/bbs/Goss… 2020-03-10 0800-001922 1. 10 103 12 13 136 14 1…
##  9 https://www.ptt.cc/bbs/Goss… 2020-03-10 1. 12 120958 18 19 2. 233 26 3. 4. 4…
## 10 https://www.ptt.cc/bbs/Goss… 2020-03-10 10 135 15 246 704697 包括 抱怨 報導 本來 比較 …
## # … with 24 more rows

#李來希

mask_comment = fread('Boomer_articleMetaData.csv',encoding = 'UTF-8')
head(mask_comment)
##                                      artTitle    artDate  artTime
## 1: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 2: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 3: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 4: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 5: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
## 6: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020/03/08 06:02:31
##                                                      artUrl    artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
##       artCat commentPoster commentStatus         commentDate
## 1: Gossiping  BlueBird5566            噓 2020-03-08 14:02:00
## 2: Gossiping     kodak2222            噓 2020-03-08 14:03:00
## 3: Gossiping        mkopin            噓 2020-03-08 14:03:00
## 4: Gossiping        kutkin            推 2020-03-08 14:03:00
## 5: Gossiping        hihi29            推 2020-03-08 14:03:00
## 6: Gossiping       Sougetu            推 2020-03-08 14:03:00
##                     commentContent
## 1: :台灣特色的綠色共產主義你懂個屁
## 2:                             :滾
## 3:                       :白痴閉嘴
## 4:             :重點配給還是不夠用
## 5:                 :一樓柯糞好了啦
## 6:                         :推一樓

#調整日期格式

mask_comment$artDate= mask_comment$artDate %>% as.Date("%Y/%m/%d")
head(mask_comment)
##                                      artTitle    artDate  artTime
## 1: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 2: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 3: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 4: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 5: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
## 6: [新聞]再嗆政府口罩政策李來希酸:搞到要配給 2020-03-08 06:02:31
##                                                      artUrl    artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html johnson20524
##       artCat commentPoster commentStatus         commentDate
## 1: Gossiping  BlueBird5566            噓 2020-03-08 14:02:00
## 2: Gossiping     kodak2222            噓 2020-03-08 14:03:00
## 3: Gossiping        mkopin            噓 2020-03-08 14:03:00
## 4: Gossiping        kutkin            推 2020-03-08 14:03:00
## 5: Gossiping        hihi29            推 2020-03-08 14:03:00
## 6: Gossiping       Sougetu            推 2020-03-08 14:03:00
##                     commentContent
## 1: :台灣特色的綠色共產主義你懂個屁
## 2:                             :滾
## 3:                       :白痴閉嘴
## 4:             :重點配給還是不夠用
## 5:                 :一樓柯糞好了啦
## 6:                         :推一樓

#斷詞

comment_tokens <- mask_comment %>% unnest_tokens(word, commentContent, token=chi_tokenizer)
str(comment_tokens)
## Classes 'data.table' and 'data.frame':   6308 obs. of  10 variables:
##  $ artTitle     : chr  "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" "[新聞]再嗆政府口罩政策李來希酸:搞到要配給" ...
##  $ artDate      : Date, format: "2020-03-08" "2020-03-08" ...
##  $ artTime      : chr  "06:02:31" "06:02:31" "06:02:31" "06:02:31" ...
##  $ artUrl       : chr  "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" "https://www.ptt.cc/bbs/Gossiping/M.1583647354.A.C5B.html" ...
##  $ artPoster    : chr  "johnson20524" "johnson20524" "johnson20524" "johnson20524" ...
##  $ artCat       : chr  "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
##  $ commentPoster: chr  "BlueBird5566" "BlueBird5566" "BlueBird5566" "BlueBird5566" ...
##  $ commentStatus: chr  "噓" "噓" "噓" "噓" ...
##  $ commentDate  : chr  "2020-03-08 14:02:00" "2020-03-08 14:02:00" "2020-03-08 14:02:00" "2020-03-08 14:02:00" ...
##  $ word         : chr  "台灣" "特色" "綠色" "共產主義" ...
##  - attr(*, ".internal.selfref")=<externalptr>

文字雲

# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
comment_data <- comment_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by_all() %>% 
  summarise(count = n())

# 印出最常見的20個詞彙
head(comment_data, 20)
## # A tibble: 20 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat,
## #   commentPoster, commentStatus, commentDate [8]
##    artTitle artDate    artTime artUrl artPoster artCat commentPoster
##    <chr>    <date>     <chr>   <chr>  <chr>     <chr>  <chr>        
##  1 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026   
##  2 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026   
##  3 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026   
##  4 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… a126451026   
##  5 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… abacada      
##  6 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acer758219   
##  7 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro      
##  8 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro      
##  9 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… ADEMAIN      
## 10 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… ADEMAIN      
## 11 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight   
## 12 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight   
## 13 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight   
## 14 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight   
## 15 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… afternight   
## 16 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618     
## 17 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618     
## 18 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618     
## 19 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618     
## 20 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… alex0618     
## # … with 4 more variables: commentStatus <chr>, commentDate <chr>, word <chr>,
## #   count <int>

計算所有字在文集中的總詞頻

comment_count <- comment_data %>%
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>3) %>%  # 過濾出現太少次的字
  arrange(desc(count))
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
comment_data %>% 
  select(word) %>%
  inner_join(LIWC)
## Adding missing grouping variables: `artTitle`, `artDate`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 642 x 11
## # Groups:   artTitle, artDate, artTime, artUrl, artPoster, artCat,
## #   commentPoster, commentStatus, commentDate [576]
##    artTitle artDate    artTime artUrl artPoster artCat commentPoster
##    <chr>    <date>     <chr>   <chr>  <chr>     <chr>  <chr>        
##  1 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acer758219   
##  2 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… acserro      
##  3 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… angel6502    
##  4 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… Ark727       
##  5 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… asuka99      
##  6 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… banana5566   
##  7 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… bardah2c     
##  8 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… bengret      
##  9 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… carlos5978   
## 10 [新聞]爆粗口… 2020-03-11 12:15:… https… haehae31… Gossi… chrisjeremy  
## # … with 632 more rows, and 4 more variables: commentStatus <chr>,
## #   commentDate <chr>, word <chr>, sentiment <fct>

#將同一篇的斷詞整理在一起

comment_data_full = comment_data %>% 
                group_by(artUrl, artDate) %>% 
                summarise(sentence = paste0(word, collapse = " "))
# 要
boomer = comment_data_full$artUrl[grepl("李來希", comment_data_full$sentence)]
comment_data %>% filter(artUrl %in% boomer) %>% 
  select(artDate,word,count) %>% 
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 12 x 3
## # Groups:   artDate [6]
##    artDate    sentiment count
##    <date>     <fct>     <int>
##  1 2020-03-08 positive     14
##  2 2020-03-08 negative     31
##  3 2020-03-11 positive     48
##  4 2020-03-11 negative    147
##  5 2020-03-13 positive     56
##  6 2020-03-13 negative    102
##  7 2020-03-15 positive     10
##  8 2020-03-15 negative     22
##  9 2020-03-16 positive      8
## 10 2020-03-16 negative     27
## 11 2020-03-18 positive     39
## 12 2020-03-18 negative     66
comment_data %>% filter(artUrl %in% boomer) %>% 
  select(artDate,word,count) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count)) %>% 
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))
## Adding missing grouping variables: `artTitle`, `artTime`, `artUrl`, `artPoster`, `artCat`, `commentPoster`, `commentStatus`, `commentDate`
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector