Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## Text collected from the PTT Gossiping board (八卦版), 2020-03-06 ~ 2020-03-20, keyword: 口罩2.0, all matching articles. Dataset: mask_articleMetaData.csv; the word-frequency table mask_artWordFreq.csv is loaded below.
data = fread('/Users/bonniechen/Desktop/mask/mask_artWordFreq.csv',encoding = 'UTF-8')
View the first few rows (the data is already organized as an article-word-count table).
head(data)
## artTitle artDate artTime
## 1: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 2: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 3: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 4: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 5: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## 6: [新聞]政院研擬「口罩實名制2.0」最快下周四上 2020/03/06 09:11:39
## artUrl word count
## 1: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 口罩 5
## 2: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 討論 5
## 3: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 實名制 4
## 4: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 陳其邁 4
## 5: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 完整 3
## 6: https://www.ptt.cc/bbs/Gossiping/M.1583485901.A.756.html 最快 3
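Before any cleaning, a quick sanity check on the corpus size can be useful (a minimal sketch; it only uses the columns shown above, and artDate is still a character column at this point, but zero-padded YYYY/MM/DD strings sort chronologically):
# how many articles and distinct words the crawl covers, and over which dates
data %>%
  summarise(
    n_articles = n_distinct(artUrl),
    n_words = n_distinct(word),
    first_day = min(artDate),
    last_day = max(artDate)
  )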
Filter out special characters (tokens containing an underscore).
data = data %>%
filter(!grepl('_',word))
Convert the date column to Date format.
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
word_count
## # A tibble: 367 x 2
## word count
## <chr> <int>
## 1 口罩 189
## 2 預購 96
## 3 口罩實名制2.0 76
## 4 完整 75
## 5 購買 60
## 6 新聞 55
## 7 健保 54
## 8 民眾 45
## 9 記者 40
## 10 指揮中心 40
## # … with 357 more rows
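word_count already has the word/frequency shape that wordcloud2 expects (first column the word, second column its count), so an overall word cloud is a single call (a quick sketch; the count > 3 filter above keeps the cloud readable):
# overall word cloud of the filtered counts
word_count %>% wordcloud2()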
LIWC, in full Linguistic Inquiry and Word Count, was published by the psychologist Pennebaker in 2001.
# positive-word dictionary (txt file)
# words are separated by commas
P <- read_file("/Users/bonniechen/Desktop/mask/dict/liwc/positive.txt")
# negative-word dictionary (txt file)
N <- read_file("/Users/bonniechen/Desktop/mask/dict/liwc/negative.txt")
# the dictionary txt file is read in as a single string
typeof(P)
## [1] "character"
# split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build a data frame with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
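Note that data.frame() here (under pre-R-4.0 defaults) stores word and sentiment as factors, which is what triggers the "coercing into character vector" warnings in the joins below. A hedged alternative, kept under the new name LIWC_chr so the printed output in this document stays as shown, is to convert both columns to character:
# same dictionary with character columns; joins against it raise no coercion warning
LIWC_chr <- LIWC %>%
  mutate(word = as.character(word), sentiment = as.character(sentiment))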
Check whether the corpus words that appear in the LIWC dictionary are positive or negative.
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 18 x 3
## word count sentiment
## <chr> <int> <fct>
## 1 流行 19 positive
## 2 成功 14 positive
## 3 問題 12 negative
## 4 希望 9 positive
## 5 決定 7 positive
## 6 簡單 5 positive
## 7 順利 5 positive
## 8 死人 5 negative
## 9 優惠 5 positive
## 10 自由 5 positive
## 11 才能 4 positive
## 12 改善 4 positive
## 13 隔離 4 negative
## 14 健康 4 positive
## 15 確定 4 positive
## 16 榮譽 4 positive
## 17 效率 4 positive
## 18 壓力 4 negative
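Only 18 of the 367 frequent words match the dictionary; a small coverage check makes that explicit (a sketch):
# share of the frequent words that appear in the LIWC dictionary
matched <- word_count %>% inner_join(LIWC, by = "word")
nrow(matched) / nrow(word_count) # roughly 18 / 367, i.e. about 5% here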
data %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## word sentiment
## 1 適時 positive
## 2 讚賞 positive
## 3 投入 positive
## 4 希望 positive
## 5 歡呼 positive
## 6 希望 positive
## 7 混亂 negative
## 8 壓力 negative
## 9 確定 positive
## 10 負擔 negative
## 11 願意 positive
## 12 希望 positive
## 13 自由 positive
## 14 流行 positive
## 15 確定 positive
## 16 擔心 negative
## 17 決定 positive
## 18 希望 positive
## 19 流行 positive
## 20 隔離 negative
## 21 健康 positive
## 22 死亡 negative
## 23 成功 positive
## 24 八卦 negative
## 25 濫用 negative
## 26 改善 positive
## 27 問題 negative
## 28 不受 negative
## 29 放棄 negative
## 30 確定 positive
## 31 禮節 positive
## 32 不適 negative
## 33 主動 positive
## 34 及時 positive
## 35 成功 positive
## 36 流行 positive
## 37 改善 positive
## 38 問題 negative
## 39 不受 negative
## 40 放棄 negative
## 41 不便 negative
## 42 決定 positive
## 43 抱怨 negative
## 44 完善 positive
## 45 外行 negative
## 46 希望 positive
## 47 健康 positive
## 48 安全 positive
## 49 答應 positive
## 50 成功 positive
## 51 吸引 positive
## 52 分享 positive
## 53 願意 positive
## 54 效率 positive
## 55 謝謝 positive
## 56 安心 positive
## 57 流行 positive
## 58 問題 negative
## 59 問題 negative
## 60 榮譽 positive
## 61 簡單 positive
## 62 不便 negative
## 63 不足 negative
## 64 成功 positive
## 65 死人 negative
## 66 不解 negative
## 67 擔心 negative
## 68 效率 positive
## 69 輕鬆 positive
## 70 混亂 negative
## 71 問題 negative
## 72 簡單 positive
## 73 批評 negative
## 74 不足 negative
## 75 成功 positive
## 76 死人 negative
## 77 不解 negative
## 78 可憐 negative
## 79 有效 positive
## 80 鼓勵 positive
## 81 負擔 negative
## 82 抨擊 negative
## 83 批評 negative
## 84 才能 positive
## 85 不滿 negative
## 86 死人 negative
## 87 自由 positive
## 88 榮譽 positive
## 89 困擾 negative
## 90 偉大 positive
## 91 天才 positive
## 92 才能 positive
## 93 問題 negative
## 94 榮譽 positive
## 95 死人 negative
## 96 不足 negative
## 97 成功 positive
## 98 不解 negative
## 99 優惠 positive
## 100 成功 positive
## 101 優惠 positive
## 102 重要 positive
## 103 鼓勵 positive
## 104 最好 positive
## 105 順利 positive
## 106 流行 positive
## 107 效率 positive
## 108 簡單 positive
## 109 不錯 positive
## 110 相信 positive
## 111 希望 positive
## 112 順利 positive
## 113 協助 positive
## 114 改善 positive
## 115 成功 positive
## 116 麻煩 negative
## 117 問題 negative
## 118 流行 positive
## 119 決定 positive
## 120 詐騙 negative
## 121 疑惑 negative
## 122 歡迎 positive
## 123 感謝 positive
## 124 協助 positive
## 125 分享 positive
## 126 決定 positive
## 127 希望 positive
## 128 滿足 positive
## 129 解決 positive
## 130 效率 positive
## 131 優點 positive
## 132 友善 positive
## 133 公正 positive
## 134 公平 positive
## 135 降低 negative
## 136 壓力 negative
## 137 朋友 positive
## 138 功勞 positive
## 139 自由 positive
## 140 付出 positive
## 141 不公平 negative
## 142 可怕 negative
## 143 可愛 positive
## 144 犧牲 negative
## 145 不平 negative
## 146 流行 positive
## 147 積極 positive
## 148 成功 positive
## 149 自由 positive
## 150 決定 positive
## 151 流行 positive
## 152 成功 positive
## 153 八卦 negative
## 154 帥氣 positive
## 155 確定 positive
## 156 才能 positive
## 157 朋友 positive
## 158 流行 positive
## 159 自由 positive
## 160 才能 positive
## 161 爆發 negative
# sentiment analysis with the LIWC dictionary
sentiment_count = data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
Positive words outnumber negative words, so the Mask 2.0 rollout may have had a positive impact.
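To quantify the "positive > negative" impression, a daily net-sentiment score can help (a sketch using tidyr::spread, which is already loaded; fill = 0 covers days where one sentiment is absent):
# wide format: one positive and one negative column per day, then the difference
sentiment_count %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = artDate, y = net)) +
  geom_col() +
  scale_x_date(labels = date_format("%m/%d"))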
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/10'))[1]])), colour = "red") +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/11'))[1]])), colour = "blue")
# geom_vline draws a vertical line; xintercept says at which artDate value to draw it
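Since artDate is a Date column, an equivalent and slightly simpler form is to pass the date itself as xintercept (a sketch; the x axis is a Date scale, so a date converted with as.numeric() lands on the right position):
# the same two reference lines, without indexing into sentiment_count
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d")) +
  geom_vline(xintercept = as.numeric(as.Date('2020-03-10')), colour = "red") +
  geom_vline(xintercept = as.numeric(as.Date('2020-03-11')), colour = "blue")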
Use the sentiment swings to look back at what happened on those dates.
data %>% filter(artDate == as.Date('2020/03/10')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [爆卦]口罩實名制2.0、新增兩例確診 2020-03-10 05:42:41
## 2 [新聞]口罩實名制2.0周四上路線上預購每筆自付7元物流費 2020-03-10 06:02:01
## 3 [新聞]口罩2.0要運費 陳玉珍竟嗆:這是小錢沒必要 2020-03-10 06:48:23
## 4 [新聞]口罩實名制2.0開會影片曝光 蘇貞昌:我 2020-03-10 08:48:00
## 5 [新聞]口罩實名制2.0銀行局:三種付款皆免手 2020-03-10 15:55:32
## artUrl word count
## 1 https://www.ptt.cc/bbs/Gossiping/M.1583818963.A.92D.html 指揮中心 16
## 2 https://www.ptt.cc/bbs/Gossiping/M.1583820123.A.075.html 預購 11
## 3 https://www.ptt.cc/bbs/Gossiping/M.1583822906.A.D81.html 運費 11
## 4 https://www.ptt.cc/bbs/Gossiping/M.1583830083.A.04B.html 口罩實名制2.0 5
## 5 https://www.ptt.cc/bbs/Gossiping/M.1583855734.A.AE2.html 信用卡 7
data %>% filter(artDate == as.Date('2020/03/11')) %>% distinct(artUrl, .keep_all = TRUE)
## artTitle artDate artTime
## 1 [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘? 2020-03-11 01:07:39
## 2 [新聞]店員又要崩潰...口罩2.0「超商排隊大打 2020-03-11 03:23:27
## 3 [新聞]口罩實名制2.0「加7元物流費」李來希狂轟 2020-03-11 03:28:34
## 4 [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不 2020-03-11 06:06:52
## 5 [問卦]明天口罩2.0 2020-03-11 13:24:59
## 6 [新聞]批口罩實名制2.0李來希不滿:要先學會上 2020-03-11 16:41:13
## 7 [新聞]口罩實名制2.0多7元李來希:把人民當腦 2020-03-11 23:09:17
## artUrl word count
## 1 https://www.ptt.cc/bbs/Gossiping/M.1583888863.A.EC6.html 李來希 9
## 2 https://www.ptt.cc/bbs/Gossiping/M.1583897009.A.C25.html 口罩 8
## 3 https://www.ptt.cc/bbs/Gossiping/M.1583897319.A.885.html 人民 8
## 4 https://www.ptt.cc/bbs/Gossiping/M.1583906818.A.4C7.html 口罩 7
## 5 https://www.ptt.cc/bbs/Gossiping/M.1583933101.A.91B.html 口罩 4
## 6 https://www.ptt.cc/bbs/Gossiping/M.1583944875.A.0C3.html 口罩 8
## 7 https://www.ptt.cc/bbs/Gossiping/M.1583968160.A.3F5.html 口罩 9
## word cloud for 3/10
data %>%
filter(artDate == as.Date('2020/03/10')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>6) %>% # drop words that appear too rarely
wordcloud2()
## word cloud for 3/11
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>5) %>% # drop words that appear too rarely
wordcloud2()
Which article has the most negative sentiment, and which negative words does it contain?
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 6 x 4
## # Groups: artUrl [6]
## artUrl sentiment artTitle count
## <chr> <fct> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0「加7元物流費」李來希狂轟… 6
## 2 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0付7元運費 李來希轟:把人民當腦殘… 5
## 3 https://www.ptt.cc/bbs/Gossipi… negative [新聞]口罩實名制2.0多7元李來希:把人民當腦… 4
## 4 https://www.ptt.cc/bbs/Gossipi… negative [新聞]堅持口罩2.0不該收7元運費陳玉珍:這不… 3
## 5 https://www.ptt.cc/bbs/Gossipi… negative [新聞]批口罩實名制2.0李來希不滿:要先學會上… 3
## 6 https://www.ptt.cc/bbs/Gossipi… negative [新聞]店員又要崩潰...口罩2.0「超商排隊大打… 2
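To also answer the second half of the question, the same grouping can carry the matched negative words along for display (a sketch; paste() simply concatenates them):
# which negative words drive each article's negative count on 3/11
data %>%
  filter(artDate == as.Date('2020/03/11')) %>%
  inner_join(LIWC) %>%
  filter(sentiment == "negative") %>%
  group_by(artUrl) %>%
  summarise(
    artTitle = artTitle[1],
    neg_words = paste(unique(word), collapse = ", "),
    count = n()
  ) %>%
  arrange(desc(count))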
data %>%
filter(artDate == as.Date('2020/03/11')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
data %>%
filter(artDate == as.Date('2020/03/10')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
data %>%
filter(artDate == as.Date('2020/03/12')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14, family = "Heiti TC Light"))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
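The three per-day plots above repeat the same pipeline with only the date changed; a hedged refactor is to wrap it in a small helper (plot_day is a name introduced here, not part of the original analysis):
# helper: top sentiment words for one day, plotted the same way as above
plot_day <- function(day) {
  data %>%
    filter(artDate == as.Date(day)) %>%
    inner_join(LIWC, by = "word") %>%
    group_by(word, sentiment) %>%
    summarise(count = n()) %>%
    ungroup() %>%
    top_n(30, wt = count) %>%
    mutate(word = reorder(word, count)) %>%
    ggplot(aes(word, count, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = "Contribution to sentiment", x = NULL) +
    theme(text = element_text(size = 14, family = "Heiti TC Light")) +
    coord_flip()
}
plot_day('2020/03/10') # same figure as the 3/10 plot above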