Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.0.2
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(readr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
data = fread('corona_artWordFreq.csv',encoding = 'UTF-8')
過濾特殊字元
data = data %>%
filter(!grepl('_',word))
轉換日期格式
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # 過濾出現太少次的字
arrange(desc(count))
word_count
## # A tibble: 19,070 x 2
## word count
## <chr> <int>
## 1 武漢 8630
## 2 肺炎 8512
## 3 疫情 8456
## 4 確診 7913
## 5 口罩 7251
## 6 感染 6489
## 7 中國 6372
## 8 發稿 5316
## 9 病毒 5240
## 10 台灣 4841
## # ... with 19,060 more rows
全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版
# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(P)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N) #rbind把兩個coulmn 數一樣的dataframe垂直合併在一起
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
文集中的字出現在LIWC字典中是屬於positive還是negative
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 684 x 3
## word count sentiment
## <chr> <int> <chr>
## 1 隔離 4464 negative
## 2 死亡 1989 negative
## 3 問題 1620 negative
## 4 治療 1480 positive
## 5 健康 1337 positive
## 6 流行 1304 positive
## 7 嚴重 1249 negative
## 8 爆發 1190 negative
## 9 希望 1070 positive
## 10 風險 986 negative
## # ... with 674 more rows
# data %>%
# select(word) %>%
# inner_join(LIWC) #簡單看一下哪些字是正負向的
統計每天的文章正面字的次數與負面字的次數
# 根據每天做分組,根據字頻來分群
sentiment_count = data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
###畫圖
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
疫情是一個負面的議題,可想而見的是負面字頻都是大於正面字頻的
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/15'))
[1]])),colour = "red")
#geom_vline畫出vertical line,xintercept告訴他要在artDate欄位的哪一個row畫線 透過觀察情緒變化來回顧事件內容
# data %>% filter(artDate == as.Date('2020/03/15')) %>% distinct(artUrl, .keep_all = TRUE)
data %>%
filter(artDate == as.Date('2020/03/15')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>20) %>% # 過濾出現太少次的字
wordcloud2()
data %>%
filter(artDate == as.Date('2020/03/15')) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## # A tibble: 184 x 4
## # Groups: artUrl [184]
## artUrl sentiment artTitle count
## <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]美國防疫遺忘的「街友感染危機」~ 31
## 2 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[問題]三月份還出國旅遊真的很自私?~ 24
## 3 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[新聞]紐時:中國為西方贏得時間西方卻浪費了它~ 21
## 4 https://www.ptt.cc/bbs/nCoV2019~ negative [情報]0315CECC疫情指揮中心記者會摘要~ 17
## 5 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]疫情下傳中國官方要求“停止妖魔化外國”~ 16
## 6 https://www.ptt.cc/bbs/nCoV2019~ negative [情報]0315CECC疫情指揮中心記者會摘要~ 14
## 7 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]矽谷防疫亂象:改在家上班人潮湧進賣場~ 13
## 8 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[問題]三月份還出國旅遊真的很自私?~ 13
## 9 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[討論]健康人戴口罩真的沒有用嗎?~ 12
## 10 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[問題]三月份還出國旅遊真的很自私?~ 11
## # ... with 174 more rows
###畫圖
data %>%
filter(artDate == as.Date('2020/03/15')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
觀察前後一天的狀況
data %>%
filter(artDate %in% c(as.Date('2020/03/14'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## # A tibble: 123 x 4
## # Groups: artUrl [123]
## artUrl sentiment artTitle count
## <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/nCoV2~ negative [新聞]紐時:中國為西方贏得時間西方卻浪費了它~ 38
## 2 https://www.ptt.cc/bbs/nCoV2~ negative Re:[情報]英國專家:群體免疫需感染60%國民~ 25
## 3 https://www.ptt.cc/bbs/nCoV2~ negative [新聞]經濟學人流行病政治學 25
## 4 https://www.ptt.cc/bbs/nCoV2~ negative [新聞]馬尼拉封城30天,杜特蒂:謝謝習近平~ 18
## 5 https://www.ptt.cc/bbs/nCoV2~ negative [情報]環球時報社評:瑞典向新冠病毒投降將害人~ 15
## 6 https://www.ptt.cc/bbs/nCoV2~ negative [新聞]護理師接武漢台人…摘口罩「臉全壓痕」 ~ 12
## 7 https://www.ptt.cc/bbs/nCoV2~ negative [情報]東奧延辦誰說了算?病毒賽跑鈔票,贊助商砌成的「東京奧~ 12
## 8 https://www.ptt.cc/bbs/nCoV2~ negative [情報]為什麼「別讓聯合國等於中國」,現在變得~ 12
## 9 https://www.ptt.cc/bbs/nCoV2~ negative Re:[新聞]紐時:中國為西方贏得時間西方卻浪費了它~ 11
## 10 https://www.ptt.cc/bbs/nCoV2~ negative [新聞]蘋論:疫情全球釀災中國厚顏卸責~ 10
## # ... with 113 more rows
data %>%
filter(artDate %in% c(as.Date('2020/03/16'))) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artUrl,sentiment) %>%
summarise(
artTitle = artTitle[1],
count = n()
) %>%
arrange(desc(count))
## Joining, by = "word"
## # A tibble: 171 x 4
## # Groups: artUrl [171]
## artUrl sentiment artTitle count
## <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/nCoV2019~ negative Fw:[爆卦]PTT創世神:英首相要讓大家多染病是假新~ 34
## 2 https://www.ptt.cc/bbs/nCoV2019~ negative [情報]0316CECC疫情指揮中心記者會摘要~ 24
## 3 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]南韓「教會疫情」再次威脅首都圈:城南~ 15
## 4 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[情報]新增8確診個案,均於國外被感染~ 12
## 5 https://www.ptt.cc/bbs/nCoV2019~ negative Re:[討論]歐洲各國學生急於返台,台灣該如何應對?~ 11
## 6 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]看不慣英國和瑞士佛系防疫星國部長:~ 11
## 7 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]美國律師提集體訟訴狀告中國引發武漢肺~ 11
## 8 https://www.ptt.cc/bbs/nCoV2019~ negative [新聞]美國律師提集體訴訟狀告中國引發武漢新~ 11
## 9 https://www.ptt.cc/bbs/nCoV2019~ negative [情報]英國金融時報即時追蹤圖 9
## 10 https://www.ptt.cc/bbs/nCoV2019~ negative [情報]俄國有63人確診官員:將爆發社區感染~ 8
## # ... with 161 more rows
data %>%
filter(artDate == as.Date('2020/03/14')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
data %>%
filter(artDate == as.Date('2020/03/16')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
隔離、死亡、嚴重等是疫情常見負面字眼,「自私」是15號與前後一天不同的字詞。可觀察到在15號Po文較前後一天特別聚焦在此負面話題。
####國家名字資料集
country = fread('country.csv',encoding = 'UTF-8')
colnames(country)[1] = "country"
將臺灣/台灣統一成臺灣,方便後續篩選
data$word[which(data$word == "台灣")] = "臺灣"
將同一篇的斷詞整理在一起
data_full = data %>% select(artUrl,word) %>%
group_by(artUrl) %>%
summarise(sentence = paste0(word, collapse = " "))
###台灣
只選擇文章中指出現指定國名、不出現其他國名的文章。因為如果一篇文章中出現多個國名,目前無法判斷情緒是針對哪個國家
# 要排除的國名
exclude = paste(country$country[country$country != "臺灣"],collapse="|")
# 要
taiwan = data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("臺灣", data_full$sentence)]
data %>% filter(artUrl %in% taiwan) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
###中國
exclude = paste(country$country[country$country != "中國"],collapse="|")
china = data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("中國", data_full$sentence)]
data %>% filter(artUrl %in% china) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
chinaData = data %>% filter(artUrl %in% china) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
#中國在哪一天的負面情緒達到最高
chinaData$artDate[which.max(chinaData$count[chinaData$sentiment == "negative"])]
## [1] "2020-02-06"
data %>% filter(artUrl %in% china) %>%
group_by(artDate,artUrl) %>%
summarise() %>%
group_by(artDate) %>%
summarise(
count = n()
)
## # A tibble: 55 x 2
## artDate count
## <date> <int>
## 1 2020-01-26 14
## 2 2020-01-27 10
## 3 2020-01-28 10
## 4 2020-01-29 5
## 5 2020-01-30 7
## 6 2020-01-31 6
## 7 2020-02-01 8
## 8 2020-02-02 4
## 9 2020-02-03 11
## 10 2020-02-04 15
## # ... with 45 more rows
######中國在2/6的討論篇數並不特別多,有可能是內容文字比較多
###韓國
exclude = paste(country$country[country$country != "韓國"],collapse="|")
korea = data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("韓國", data_full$sentence)]
data %>% filter(artUrl %in% korea) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
koreaData = data %>% filter(artUrl %in% korea) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
## Joining, by = "word"
#韓國在哪一天的負面情緒達到最高
koreaData$artDate[which.max(koreaData$count[koreaData$sentiment == "negative"])]
## [1] "2020-02-25"
exclude = paste(country$country[country$country != "義大利"],collapse="|")
italy = data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("義大利", data_full$sentence)]
data %>% filter(artUrl %in% italy) %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
義大利在2月討論聲量低,在3月開始疫情爆發
#新增一欄位紀錄國家類別
data$country = ""
data$country[data$artUrl %in% taiwan] = "taiwan"
data$country[data$artUrl %in% china] = "china"
data$country[data$artUrl %in% korea] = "korea"
data$country[data$artUrl %in% italy] = "italy"
將正負面情緒分開看
data %>% filter(country != "") %>%
select(artDate,word,count,country) %>%
inner_join(LIWC) %>%
filter(sentiment == "positive") %>%
group_by(artDate,sentiment,country) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=country))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
data %>% filter(country != "") %>%
select(artDate,word,count,country) %>%
inner_join(LIWC) %>%
filter(sentiment == "negative") %>%
group_by(artDate,sentiment,country) %>%
summarise(count=sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=country))+
scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.