Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## Text collected from the PTT Gossiping board (八卦版), 2020-03-06 ~ 2020-03-20, keyword: 口罩2.0, all matching articles. Dataset: mask_articleMetaData.csv; the word-frequency table mask_artWordFreq.csv is loaded below.
data = fread('/Users/bonniechen/Desktop/mask/mask_artWordFreq.csv',encoding = 'UTF-8')
View the first few rows (the data is already organized as an article-word-count table).
head(data)
## artTitle artDate artTime
## 1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 6: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 口罩 5
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 討論 5
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 實名制 4
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 陳其邁 4
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 完整 3
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 最快 3
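Before any cleaning, a quick sanity check on the corpus size can be useful (a minimal sketch; it only uses the columns shown above, and artDate is still a character column at this point, but zero-padded YYYY/MM/DD strings sort chronologically):
# how many articles and distinct words the crawl covers, and over which dates
data %>%
  summarise(
    n_articles = n_distinct(artUrl),
    n_words = n_distinct(word),
    first_day = min(artDate),
    last_day = max(artDate)
  )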
Filter out special characters (tokens containing an underscore).
data = data %>%
filter(!grepl('_',word))
Convert the date column to Date format.
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
word_count
## # A tibble: 367 x 2
## word count
## <chr> <int>
## 1 口罩 189
## 2 預購 96
## 3 口罩實名制2.0 76
## 4 完整 75
## 5 購買 60
## 6 新聞 55
## 7 健保 54
## 8 民眾 45
## 9 記者 40
## 10 指揮中心 40
## # … with 357 more rows
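word_count already has the word/frequency shape that wordcloud2 expects (first column the word, second column its count), so an overall word cloud is a single call (a quick sketch; the count > 3 filter above keeps the cloud readable):
# overall word cloud of the filtered counts
word_count %>% wordcloud2()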
LIWC, in full Linguistic Inquiry and Word Count, was published by the psychologist Pennebaker in 2001.
# positive-word dictionary (txt file)
# words are separated by commas
P <- read_file("/Users/bonniechen/Desktop/mask/dict/liwc/positive.txt")
# negative-word dictionary (txt file)
N <- read_file("/Users/bonniechen/Desktop/mask/dict/liwc/negative.txt")
# the dictionary txt file is read in as a single string
typeof(P)
## [1] "character"
# split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build a data frame with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
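Note that data.frame() here (under pre-R-4.0 defaults) stores word and sentiment as factors, which is what triggers the "coercing into character vector" warnings in the joins below. A hedged alternative, kept under the new name LIWC_chr so the printed output in this document stays as shown, is to convert both columns to character:
# same dictionary with character columns; joins against it raise no coercion warning
LIWC_chr <- LIWC %>%
  mutate(word = as.character(word), sentiment = as.character(sentiment))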
Check whether the corpus words that appear in the LIWC dictionary are positive or negative.
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 18 x 3
## word count sentiment
## <chr> <int> <fct>
## 1 流行 19 positive
## 2 成功 14 positive
## 3 問題 12 negative
## 4 希望 9 positive
## 5 決定 7 positive
## 6 簡單 5 positive
## 7 順利 5 positive
## 8 死人 5 negative
## 9 優惠 5 positive
## 10 自由 5 positive
## 11 才能 4 positive
## 12 改善 4 positive
## 13 隔離 4 negative
## 14 健康 4 positive
## 15 確定 4 positive
## 16 榮譽 4 positive
## 17 效率 4 positive
## 18 壓力 4 negative
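Only 18 of the 367 frequent words match the dictionary; a small coverage check makes that explicit (a sketch):
# share of the frequent words that appear in the LIWC dictionary
matched <- word_count %>% inner_join(LIWC, by = "word")
nrow(matched) / nrow(word_count) # roughly 18 / 367, i.e. about 5% here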
data %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## word sentiment
## 1 適時 positive
## 2 讚賞 positive
## 3 投入 positive
## 4 希望 positive
## 5 歡呼 positive
## 6 希望 positive
## 7 混亂 negative
## 8 壓力 negative
## 9 確定 positive
## 10 負擔 negative
## 11 願意 positive
## 12 希望 positive
## 13 自由 positive
## 14 流行 positive
## 15 確定 positive
## 16 擔心 negative
## 17 決定 positive
## 18 希望 positive
## 19 流行 positive
## 20 隔離 negative
## 21 健康 positive
## 22 死亡 negative
## 23 成功 positive
## 24 八卦 negative
## 25 濫用 negative
## 26 改善 positive
## 27 問題 negative
## 28 不受 negative
## 29 放棄 negative
## 30 確定 positive
## 31 禮節 positive
## 32 不適 negative
## 33 主動 positive
## 34 及時 positive
## 35 成功 positive
## 36 流行 positive
## 37 改善 positive
## 38 問題 negative
## 39 不受 negative
## 40 放棄 negative
## 41 不便 negative
## 42 決定 positive
## 43 抱怨 negative
## 44 完善 positive
## 45 外行 negative
## 46 希望 positive
## 47 健康 positive
## 48 安全 positive
## 49 答應 positive
## 50 成功 positive
## 51 吸引 positive
## 52 分享 positive
## 53 願意 positive
## 54 效率 positive
## 55 謝謝 positive
## 56 安心 positive
## 57 流行 positive
## 58 問題 negative
## 59 問題 negative
## 60 榮譽 positive
## 61 簡單 positive
## 62 不便 negative
## 63 不足 negative
## 64 成功 positive
## 65 死人 negative
## 66 不解 negative
## 67 擔心 negative
## 68 效率 positive
## 69 輕鬆 positive
## 70 混亂 negative
## 71 問題 negative
## 72 簡單 positive
## 73 批評 negative
## 74 不足 negative
## 75 成功 positive
## 76 死人 negative
## 77 不解 negative
## 78 可憐 negative
## 79 有效 positive
## 80 鼓勵 positive
## 81 負擔 negative
## 82 抨擊 negative
## 83 批評 negative
## 84 才能 positive
## 85 不滿 negative
## 86 死人 negative
## 87 自由 positive
## 88 榮譽 positive
## 89 困擾 negative
## 90 偉大 positive
## 91 天才 positive
## 92 才能 positive
## 93 問題 negative
## 94 榮譽 positive
## 95 死人 negative
## 96 不足 negative
## 97 成功 positive
## 98 不解 negative
## 99 優惠 positive
## 100 成功 positive
## 101 優惠 positive
## 102 重要 positive
## 103 鼓勵 positive
## 104 最好 positive
## 105 順利 positive
## 106 流行 positive
## 107 效率 positive
## 108 簡單 positive
## 109 不錯 positive
## 110 相信 positive
## 111 希望 positive
## 112 順利 positive
## 113 協助 positive
## 114 改善 positive
## 115 成功 positive
## 116 麻煩 negative
## 117 問題 negative
## 118 流行 positive
## 119 決定 positive
## 120 詐騙 negative
## 121 疑惑 negative
## 122 歡迎 positive
## 123 感謝 positive
## 124 協助 positive
## 125 分享 positive
## 126 決定 positive
## 127 希望 positive
## 128 滿足 positive
## 129 解決 positive
## 130 效率 positive
## 131 優點 positive
## 132 友善 positive
## 133 公正 positive
## 134 公平 positive
## 135 降低 negative
## 136 壓力 negative
## 137 朋友 positive
## 138 功勞 positive
## 139 自由 positive
## 140 付出 positive
## 141 不公平 negative
## 142 可怕 negative
## 143 可愛 positive
## 144 犧牲 negative
## 145 不平 negative
## 146 流行 positive
## 147 積極 positive
## 148 成功 positive
## 149 自由 positive
## 150 決定 positive
## 151 流行 positive
## 152 成功 positive
## 153 八卦 negative
## 154 帥氣 positive
## 155 確定 positive
## 156 才能 positive
## 157 朋友 positive
## 158 流行 positive
## 159 自由 positive
## 160 才能 positive
## 161 爆發 negative
# sentiment analysis with the LIWC dictionary
sentiment_count = data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
Positive words outnumber negative words, so the Mask 2.0 rollout may have had a positive impact.
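To quantify the "positive > negative" impression, a daily net-sentiment score can help (a sketch using tidyr::spread, which is already loaded; fill = 0 covers days where one sentiment is absent):
# wide format: one positive and one negative column per day, then the difference
sentiment_count %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = artDate, y = net)) +
  geom_col() +
  scale_x_date(labels = date_format("%m/%d"))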
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/10'))[1]])), colour = "red") +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/11'))[1]])), colour = "blue")
# geom_vline draws a vertical line; xintercept says at which artDate value to draw it
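Since artDate is a Date column, an equivalent and slightly simpler form is to pass the date itself as xintercept (a sketch; the x axis is a Date scale, so a date converted with as.numeric() lands on the right position):
# the same two reference lines, without indexing into sentiment_count
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d")) +
  geom_vline(xintercept = as.numeric(as.Date('2020-03-10')), colour = "red") +
  geom_vline(xintercept = as.numeric(as.Date('2020-03-11')), colour = "blue")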
Use the sentiment swings to look back at what happened on those dates.
data %>% filter(artDate == as.Date('2020/03/10')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [爆卦]口罩實名制2.0、新增兩例確診 2020-03-10 05:42:41
## 2 [新聞]口罩實名制2.0周四上路線上預購每筆自付7元物流費 2020-03-10 06:02:01
## 3 [新聞]口罩2.0要運費 陳玉珍竟嗆:這是小錢沒必要 2020-03-10 06:48:23
## 4 [新聞]口罩實名制2.0開會影片曝光 蘇貞昌:我 2020-03-10 08:48:00
## 5 [新聞]口罩實名制2.0銀行局:三種付款皆免手 2020-03-10 15:55:32
## artUrl word count
## 1 https://www.ptt.cc/bbs/Gossiping/M.1583818963.A.92D.html 指揮中心 16
## 2 https://www.ptt.cc/bbs/Gossiping/M.1583820123.A.075.html 預購 11
## 3 https://www.ptt.cc/bbs/Gossiping/M.1583822906.A.D81.html 運費 11
## 4 https://www.ptt.cc/bbs/Gossiping/M.1583830083.A.04B.html 口罩實名制2.0 5
## 5 https://www.ptt.cc/bbs/Gossiping/M.1583855734.A.AE2.html 信用卡 7
data %>% filter(artDate == as.Date('2020/03/11')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘? 2020-03-11 01:07:39
## 2 [新聞]店員又要崩潰...口罩2.0「超商排隊大打 2020-03-11 03:23:27
## 3 [新聞]口罩實名制2.0「加7元物流費」李來希狂轟 2020-03-11 03:28:34
## 4 [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不 2020-03-11 06:06:52
## 5 [問卦]明天口罩2.0 2020-03-11 13:24:59
## 6 [新聞]批口罩實名制2.0李來希不滿:要先學會上 2020-03-11 16:41:13
## 7 [新聞]口罩實名制2.0多7元李來希:把人民當腦 2020-03-11 23:09:17
## artUrl word count
## 1 https://www.ptt.cc/bbs/Gossiping/M.1583888863.A.EC6.html 李來希 9
## 2 https://www.ptt.cc/bbs/Gossiping/M.1583897009.A.C25.html 口罩 8
## 3 https://www.ptt.cc/bbs/Gossiping/M.1583897319.A.885.html 人民 8
## 4 https://www.ptt.cc/bbs/Gossiping/M.1583906818.A.4C7.html 口罩 7
## 5 https://www.ptt.cc/bbs/Gossiping/M.1583933101.A.91B.html 口罩 4
## 6 https://www.ptt.cc/bbs/Gossiping/M.1583944875.A.0C3.html 口罩 8
## 7 https://www.ptt.cc/bbs/Gossiping/M.1583968160.A.3F5.html 口罩 9
## word cloud for 3/10
data %>%
filter(artDate == as.Date('2020/03/10')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>6) %>% # drop words that appear too rarely
wordcloud2()
## word cloud for 3/11
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>5) %>% # drop words that appear too rarely
wordcloud2()
Which article has the most negative sentiment, and which negative words does it contain?
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups: artUrl [6]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0「加7元物流費」李來希狂轟… 6
## 2 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘… 5
## 3 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0多7元李來希:把人民當腦… 4
## 4 https://www.ptt.cc/bbs/Gossipi… negative [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不… 3
## 5 https://www.ptt.cc/bbs/Gossipi… negative [新聞]批口罩實名制2.0李來希不滿:要先學會上… 3
## 6 https://www.ptt.cc/bbs/Gossipi… negative [新聞]店員又要崩潰...口罩2.0「超商排隊大打… 2
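To also answer the second half of the question, the same grouping can carry the matched negative words along for display (a sketch; paste() simply concatenates them):
# which negative words drive each article's negative count on 3/11
data %>%
  filter(artDate == as.Date('2020/03/11')) %>%
  inner_join(LIWC) %>%
  filter(sentiment == "negative") %>%
  group_by(artUrl) %>%
  summarise(
    artTitle = artTitle[1],
    neg_words = paste(unique(word), collapse = ", "),
    count = n()
  ) %>%
  arrange(desc(count))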
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
data %>%
filter(artDate == as.Date('2020/03/10')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
data %>%
filter(artDate == as.Date('2020/03/12')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
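The three per-day plots above repeat the same pipeline with only the date changed; a hedged refactor is to wrap it in a small helper (plot_day is a name introduced here, not part of the original analysis):
# helper: top sentiment words for one day, plotted the same way as above
plot_day <- function(day) {
  data %>%
    filter(artDate == as.Date(day)) %>%
    inner_join(LIWC, by = "word") %>%
    group_by(word, sentiment) %>%
    summarise(count = n()) %>%
    ungroup() %>%
    top_n(30, wt = count) %>%
    mutate(word = reorder(word, count)) %>%
    ggplot(aes(word, count, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = "Contribution to sentiment", x = NULL) +
    theme(text = element_text(size = 14, family = "Heiti TC Light")) +
    coord_flip()
}
plot_day('2020/03/10') # same figure as the 3/10 plot above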