System parameter settings

Sys.setlocale(category = "LC_ALL", locale = 'UTF-8') # avoid garbled Chinese characters
## Warning in Sys.setlocale(category = "LC_ALL", locale = "UTF-8"): OS reports
## request to set locale to "UTF-8" cannot be honored
## [1] ""
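If the OS rejects the plain "UTF-8" locale name, as the warning above shows, trying a few platform-specific spellings is one workaround. A minimal sketch; the exact locale names ("zh_TW.UTF-8", "Chinese (Traditional)_Taiwan.950") are assumptions that vary by OS:

# try locale spellings until one sticks; Sys.setlocale() returns "" on failure
for (loc in c("UTF-8", "zh_TW.UTF-8", "Chinese (Traditional)_Taiwan.950")) {
  ok <- suppressWarnings(Sys.setlocale(category = "LC_ALL", locale = loc))
  if (nzchar(ok)) break
}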

Load the required packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(readr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
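The masking messages above are harmless, since the shadowed functions stay reachable via :: (e.g. stats::filter, data.table::melt). To keep the noise out of a knitted report, the loads can be wrapped; a small sketch for the chattiest packages:

# load quietly; functionality is unchanged
suppressPackageStartupMessages({
  library(dplyr)
  library(data.table)
  library(reshape2)
})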

Data source: all posts on the PTT Gossiping board from 2020-01-01 to 2020-03-31, collected via a text-collection platform

Dataset: mask_artWordPOSFreq.csv

setwd('D:/OC Learn/NSYSU/Social Media Analysis/mask')
mask = fread('../mask/mask_artWordPOSFreq.csv',encoding = 'UTF-8')
head(mask)
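Before going further, it can help to confirm that the columns the later steps rely on (artDate, artTitle, artUrl, word, count) are present; a quick check:

# one line per column with its type and sample values
glimpse(mask)
stopifnot(all(c("artDate", "artTitle", "artUrl", "word", "count") %in% names(mask)))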

Filter out special characters

mask = mask %>% 
  filter(!grepl('_',word))
head(mask)
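grepl('_', word) only drops tokens containing an underscore. If other noise shows up, a broader filter could keep only Han characters, letters, and digits; a sketch (mask_clean is a hypothetical name, and the character class is an assumption to adjust as needed):

# drop any token containing a character outside Han/letters/digits;
# stringr's ICU regex engine supports \p{Han}
mask_clean <- mask %>%
  filter(!str_detect(word, "[^\\p{Han}A-Za-z0-9]"))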

Convert the date format

mask$artDate = mask$artDate %>% as.Date("%Y/%m/%d")
head(mask)
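as.Date() silently returns NA for strings that do not match the format, so a quick sanity check is worthwhile:

# verify the conversion: no failed parses, range matching the crawl window
sum(is.na(mask$artDate))   # expect 0
range(mask$artDate)        # expect 2020-01-01 .. 2020-03-31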

Compute each word's total frequency across the corpus

word_count <- mask %>%
  select(word, count) %>%             # occurrences of each word in every article
  group_by(word) %>%                  # group by word
  summarise(count = sum(count)) %>%   # total occurrences per word
  filter(count > 50) %>%              # drop rare words, i.e. keep only words above the threshold
  arrange(desc(count))                # sort in descending order
head(word_count)
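As a quick visual check, word_count can be fed straight to wordcloud2, which reads the first two columns as word and frequency:

# overall-frequency word cloud; word_count already has the word/count shape
word_count %>% wordcloud2()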

Use the LIWC dictionary to decide whether each word in the corpus is positive or negative

Prepare the LIWC dictionary

setwd('D:/OC Learn/NSYSU/Social Media Analysis/mask')
P <- read_file("../mask/liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("../mask/liwc/negative.txt") # negative-word dictionary (txt)
typeof(P)
## [1] "character"
typeof(N)
## [1] "character"
# split each string on commas
# strsplit returns a list; take the first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build a data frame with two columns, word and sentiment;
# the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
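Under R < 4.0, data.frame() turns word into a factor, which is what triggers the coercion warnings in the joins below. An alternative construction that keeps word as character (replacing the data.frame() calls above) would avoid them; a sketch:

# same dictionary, but word stays character, so inner_join() stops warning
P_chr <- strsplit(read_file("../mask/liwc/positive.txt"), ",")[[1]]
N_chr <- strsplit(read_file("../mask/liwc/negative.txt"), ",")[[1]]
LIWC <- rbind(
  data.frame(word = P_chr, sentiment = "positive", stringsAsFactors = FALSE),
  data.frame(word = N_chr, sentiment = "negative", stringsAsFactors = FALSE)
)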

Join with the LIWC sentiment dictionary

For each corpus word that appears in the LIWC dictionary, see whether it is positive or negative.

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
mask %>% 
  select(word) %>%
  inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
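inner_join() guesses the key, hence the 'Joining, by = "word"' message; naming it explicitly silences that, and anti_join() shows how much of the corpus the dictionary does not cover. A sketch:

# explicit key; unmatched words fall outside the LIWC vocabulary
matched   <- word_count %>% inner_join(LIWC, by = "word")
unmatched <- word_count %>% anti_join(LIWC, by = "word")
nrow(unmatched)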

Analysis with the LIWC sentiment dictionary

Count the daily totals of positive and negative words in the articles

sentiment_count = mask %>%
  select(artDate,word,count) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
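From sentiment_count, a daily net score (positive minus negative) is another way to summarise the balance; a sketch with tidyr (values_fill = 0 needs tidyr >= 1.1; older versions take values_fill = list(count = 0)):

# one row per day, 0 for days missing a sentiment, then the difference
daily_net <- sentiment_count %>%
  ungroup() %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  mutate(net = positive - negative)
head(daily_net)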

Plot the sentiment curve by day

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 

# The two curves track each other closely, but negative generally runs above positive.
# This could be compared against daily confirmed-case counts in Taiwan or other countries.
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d")) +
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/02/03'))[1]])), colour = "red") +
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/01/21'))[1]])), colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/03/17'))[1]])), colour = "yellow")

# geom_vline draws a vertical line; xintercept says at which artDate value to place it
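Since the three dates are known constants, the same lines can be drawn without indexing back into the data; a simpler equivalent sketch:

# fixed event dates instead of row lookups
event_dates <- as.Date(c("2020-01-21", "2020-02-03", "2020-03-17"))
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d")) +
  geom_vline(xintercept = as.numeric(event_dates),
             colour = c("blue", "red", "yellow"))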

Review the events behind the sentiment swings

mask %>% filter(artDate == as.Date('2020/02/03')) %>% distinct(artUrl, .keep_all = TRUE)
mask %>% 
  filter(artDate == as.Date('2020/02/03')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>30) %>%   # drop words that appear too rarely
  filter(!(word %in% c("口罩","台灣","武漢","中國"))) %>% # drop the original search keyword 口罩 (mask) and the place names 台灣, 中國, 武漢
  wordcloud2()
# against the backdrop of the COVID-19 outbreak, these words show nothing especially noteworthy

On 2020/02/03, the day with the strongest negative sentiment: which article contains the most negative words, and what are they?

mask %>% 
  filter(artDate == as.Date('2020/02/03')) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
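Note that summarise(count = n()) counts the number of distinct negative word types per article, not their total occurrences. To weight articles by how often the negative words appear, a variant:

# same ranking logic, weighted by occurrence counts
mask %>%
  filter(artDate == as.Date('2020/02/03')) %>%
  inner_join(LIWC, by = "word") %>%
  filter(sentiment == "negative") %>%
  group_by(artUrl) %>%
  summarise(artTitle = artTitle[1], count = sum(count)) %>%
  arrange(desc(count))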
mask %>%
  filter(artDate == as.Date('2020/02/03')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

On 2020/01/21, when the topic first drew heavy discussion: which article contains the most negative words, and what are they?

mask %>% 
  filter(artDate == as.Date('2020/01/21')) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
mask %>%
  filter(artDate == as.Date('2020/01/21')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

On 2020/03/17, when discussion was relatively quiet: which article contains the most negative words, and what are they?

mask %>% 
  filter(artDate == as.Date('2020/03/17')) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
mask %>%
  filter(artDate == as.Date('2020/03/17')) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector