動機與分析目的
(1)背景動機 : 最近華航機師染疫,到處趴趴走,去過清真寺等地點,造成民眾恐慌,多人必須隔離檢疫,台科大的學生甚至因此停課!👀😡

(2)研究目的: 因此以PTT 八卦版中的討論情況,來分析民眾針對此事件的看法與反應,進行探討研究。
(3)資料來源 : PTT八卦版(2020/10/28~2021/4/27),透過文字分析平台,關鍵字【華航】
1.安裝package
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','scales','widyr','igraph','ggraph','tidyr','NLP')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(readr)
library(scales)
library(jiebaR)
library(widyr)
library(igraph)
library(ggraph)
library(lubridate)
library(NLP)
library(tidyr)
library(devtools)
Error in get(genname, envir = envir) : 找不到物件 'testthat_print'
2.資料收集:PTT八卦版,2020/10/28 ~ 2021/04/27
利用平台抓取關鍵字:華航,然後匯入資料(時間+8小時)
MetaData = fread('./aaaa_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('./aaaa_articleReviews.csv',encoding = 'UTF-8')
MetaData = MetaData %>% mutate(date_total=paste(artDate,artTime,sep = " "))#由於資料是格林威治時間,和台灣時間相差八小時,時間必須+8
MetaData$date_total = as.POSIXct(MetaData$date_total,format = "%Y/%m/%d %H:%M:%S")
MetaData$date_total = MetaData$date_total + 28800
MetaData$artDate=as_date(MetaData$date_total)
MetaData$artDate= MetaData$artDate%>% as.Date("%Y/%m/%d")
Reviews = Reviews %>% mutate(date_total=paste(artDate,artTime,sep = " "))
Reviews$date_total = as.POSIXct(Reviews$date_total,format = "%Y/%m/%d %H:%M:%S")
Reviews$date_total = Reviews$date_total + 28800
Reviews$artDate=as_date(Reviews$date_total)
Reviews$artDate= Reviews$artDate%>% as.Date("%Y/%m/%d")
MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(color = "blue",aes(x=artDate,y=count))+
geom_vline(xintercept = c(as.numeric(as.Date("2020-12-15")),
as.numeric(as.Date("2021-04-23"))), col='red', size = 0.8) +
scale_x_date(labels = date_format("%Y/%m/%d"))+ggtitle("華航討論文章數") +theme(text = element_text(family='STHeitiTC-Light'))+xlab("日期")+ylab("數量")

得知:
(1)2020/12/15討論文章數最多,其次是2021/04/23
(2)2020/12/15 [新聞]華航公布777F新塗裝CARGO內藏台灣圖樣與china的文字,可能會與中國搞混,造成意識形態混亂的問題,引發熱烈討論。
(3)2020/04/23[新聞]確診的華航印尼籍機師到清真寺參加活動,當日活動參與人數逾400人
3.資料清理
首先把推文和內容的資料合併
##用於資料清理
Reviews2 = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#設定斷詞器
user<- scan(file = "./user_dict.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8',quiet = T)
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
Read 1281 items
jieba_tokenizer = worker()
new_user_word(jieba_tokenizer, c(user))
[1] TRUE
# 設定斷詞function
gossip_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
return(tokens)
})
}
把文章和留言的斷詞結果併在一起
MToken <- MetaData %>% unnest_tokens(word, sentence, token = gossip_tokenizer)
RToken <- Reviews2 %>% unnest_tokens(word, cmtContent, token = gossip_tokenizer)
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
格式化日期
data$artDate <- as.Date(data$artDate)
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
filter(nchar(.$word)>1)
查看篇數最多的那兩天裡面最常出現的詞彙
gossip_tokens_by_date <- data_select %>%
count(artDate, word, sort = TRUE)
plot_merge <- gossip_tokens_by_date %>%
filter(artDate == as.Date("2020-12-15") |
artDate == as.Date("2021-04-23")) %>%
group_by(artDate) %>%
top_n(7, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n, fill = artDate)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL) +
facet_wrap(~artDate, scales="free", ncol = 2) +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
plot_merge

4.文字雲
接下來我們來討論在這半年中,八卦版對華航的討論為何,算出全部字的詞頻後,畫出文字雲
set.seed(100)
data_select %>%
select(word) %>%
group_by(word) %>%
summarise(count=n()) %>% # 算字詞單篇總數用summarise
filter(word != "華航" & word != "台灣") %>%
arrange(desc(count)) %>%
head(120)%>% wordcloud2()->ppp

一些出現頻率比較高的如 : 機師、中國、長榮、英國、確診、口罩、防疫……
- 機師:4/23時去清真寺的消息
- 中國:在新冠肺炎、華航改名的討論時,常會一起出現
- 長榮:常與華航一起做比較,如:疫苗的施打率(長榮比華航多)
- 英國:變種病毒的來源
- 確診、口罩、來源:疫情期間常討論的話題
5.情緒分析
使用LIWC字典
P = read_file("./liwc/positive.txt") # 正向字典txt檔
N = read_file("./liwc/negative.txt") # 負向字典txt檔
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
LIWC = rbind(P, N)
算出每天情緒總和(sentiment_count),並畫出日期波動圖
sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n()) %>%
arrange(desc(count))
Joining, by = "word"
`summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
#range(sentiment_count$artDate) #"2020-10-28" "2021-04-26"
#設定y軸刻度
datebreaks = seq(as.Date("2020-10-28"), as.Date("2021-04-26"),by = "1 month")
#沒有線的情緒分數
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(breaks = datebreaks)+
theme(axis.text.x = element_text(angle = 30, hjust = 1))+ggtitle("情緒分佈") +theme(text = element_text(family='STHeitiTC-Light'))+xlab("日期")+ylab("數量")

由上圖觀察可得2020/12/15、2020/04/23,情緒波動比較大
畫出2020/12/15、2020/04/23日期線
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(breaks = datebreaks)+
theme(axis.text.x = element_text(angle = 30, hjust = 1))+
geom_vline(xintercept = c(as.numeric(as.Date("2021-04-23")),
as.numeric(as.Date("2020-12-14"))), col='blue', size = 1)

在討論到華航時,幾乎都是負面情緒大於正面情緒
這兩日最常出現的正負面情緒字
data_sentiment <- data_select %>%
filter(artDate == as.Date("2020-12-15") |
artDate == as.Date("2021-04-23") ) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14,family = 'STHeitiTC-Light'))+
coord_flip()
Joining, by = "word"
`summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

正負情緒文字雲
par(family=("Heiti TC Light"))
data_sentiment$data %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"), # positive negative
max.words = 50)

分析2021/04/23前後五天的情緒
sentiment_count %>% filter(artDate<=as.Date("2021-04-25",format="%Y-%m-%d"))%>%
# 標準化的部分
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# 畫圖的部分
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-04-20','2021-04-25'))
)+
# 加上標示日期的線
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-04-23'))[1]])),colour = "black",linetype=4)

可以看到4/23事件爆發後,由於指揮中心持續公布新的染疫者足跡地點,所以負面情緒越來越高漲,正面情緒比例一直下降,代表民眾真的很憤怒也很恐慌!
6.TF-IDF
計算每一篇文章的詞數
gossip_word = data_select %>% count(artUrl, word, sort = TRUE)
total_words = gossip_word %>%
group_by(artUrl) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
total_words
合併需要的資料欄位,計算詞彙的 tf-idf 值
gossip_word = left_join(gossip_word, total_words)
Joining, by = "artUrl"
# 以每篇文章爲單位,計算每個詞彙的 tf-idf 值
gossip_words_tf_idf = gossip_word %>%
bind_tf_idf(word, artUrl, n) %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=5) %>%
arrange(desc(artUrl))
gossip_words_tf_idf
計算整個文集中 tf-idf 值高的字
gossip_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)
- 機師:染疫事件的主角
- 英國:變種病毒
- 禮拜五:染疫事件爆發4/23是星期五
計算兩個詞彙同時出現的總次數
移除PTT貼新聞時會出現的格式
word_pairs <- gossip_word %>%
filter(word!="新聞標題" & word!="來源" & word!="違者" & word!="刪除"& word!="署名"& word!="連結"& word!="請放"& word!="完整"& word!="內文"&word!="備註"& word!="媒體"& word!="記者")%>%
pairwise_count(word, artUrl, sort = TRUE)
`distinct_()` was deprecated in dplyr 0.7.0.
Please use `distinct()` instead.
See vignette('programming') for more help`tbl_df()` was deprecated in dplyr 1.0.0.
Please use `tibble::as_tibble()` instead.
算出字詞的相關性
word_cors <- gossip_word %>%
filter(word!="新聞標題" & word!="來源" & word!="違者" & word!="刪除"& word!="署名"& word!="連結"& word!="請放"& word!="完整"& word!="內文"&word!="備註"& word!="媒體"& word!="記者")%>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
7.bigram
Metadata2 <- MetaData %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% #換行、空格都用句號取代
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
Metadata2
移除PTT貼新聞時會出現的格式用字
Metadata2 = Metadata2 %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))
bigram function
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
執行bigram分詞
data_bigram <- Metadata2 %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
#data_bigram
data_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")
8.Trigram
trigram function
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
執行trigram分詞
data_trigram <- Metadata2 %>%
unnest_tokens(ngrams, sentence, token = jieba_trigram)
#data_trigram
data_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
9.共線圖
畫出共線圖(correlation > 0.5)
set.seed(666)
word_cors %>%
filter(correlation > 0.5 ) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()

畫出共線圖(correlation > 0.7)
set.seed(666)
word_cors %>%
filter(correlation > 0.7 ) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()

10.結論
大部分民眾對於華航議題都處於負面情緒,12月左右都是在討論華航改名、飛機新塗裝的議題。4/23機師染疫風波發生之後,其染疫足跡擴及更多人,造成民眾的恐慌和憤怒,負面情緒更為高漲。
11.小發現
正常來說,當討論情緒都是負面的時候,一家公司的股價應該會下跌,但華航反而上漲,這是我們有趣的小發現:ptt反指標,嘻嘻😁😎
2020/12/15華航股價:11.70TWD

2021/04/23華航股價:20.40TWD😱

---
title: "社群媒體分析期中報告：分析華航機師染疫趴趴走事件"
author: "第五組 組員：M094020050吳佩玲 M094020062林濬紘 M094020036黃弈晴"
date: "2021/05/04"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    highlight: pygments
    theme: flatly
    css: style.css
  html_document:
    toc: yes
    df_print: paged
---

# 動機與分析目的
> (1)背景動機 : 最近華航機師染疫，到處趴趴走，去過清真寺等地點，造成民眾恐慌，多人必須隔離檢疫，台科大的學生甚至因此停課!`r "\U1F440"``r "\U1F621"`
<br><br>
![](aaa.png)<br><br>
(2)研究目的: 因此以PTT 八卦版中的討論情況，來分析民眾針對此事件的看法與反應，進行探討研究。<br>
(3)資料來源 : PTT八卦版(2020/10/28~2021/4/27)，透過文字分析平台，關鍵字【華航】

# 1.安裝package

```{r echo = T, results = 'hide'}
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
```

```{r message=FALSE, warning=FALSE, paged.print=TRUE}
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','scales','widyr','igraph','ggraph','tidyr','NLP')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```


```{r,warning=FALSE,message=FALSE}
library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(readr)
library(scales)
library(jiebaR)
library(widyr)
library(igraph)
library(ggraph)
library(lubridate)
library(NLP)
library(tidyr)
library(devtools)
```

# 2.資料收集：PTT八卦版，2020/10/28 ~ 2021/04/27

### 利用平台抓取關鍵字:華航，然後匯入資料(時間+8小時)
```{r}
MetaData = fread('./aaaa_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('./aaaa_articleReviews.csv',encoding = 'UTF-8')

MetaData = MetaData %>% mutate(date_total=paste(artDate,artTime,sep = " "))#由於資料是格林威治時間，和台灣時間相差八小時，時間必須+8
MetaData$date_total = as.POSIXct(MetaData$date_total,format = "%Y/%m/%d %H:%M:%S")
MetaData$date_total = MetaData$date_total + 28800
MetaData$artDate=as_date(MetaData$date_total)
MetaData$artDate= MetaData$artDate%>% as.Date("%Y/%m/%d")

Reviews = Reviews %>% mutate(date_total=paste(artDate,artTime,sep = " "))
Reviews$date_total = as.POSIXct(Reviews$date_total,format = "%Y/%m/%d %H:%M:%S")
Reviews$date_total = Reviews$date_total + 28800
Reviews$artDate=as_date(Reviews$date_total)
Reviews$artDate= Reviews$artDate%>% as.Date("%Y/%m/%d")
```


```{r}
MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(color = "blue",aes(x=artDate,y=count))+
   geom_vline(xintercept = c(as.numeric(as.Date("2020-12-15")),
                            as.numeric(as.Date("2021-04-23"))), col='red', size = 0.8) +
    scale_x_date(labels = date_format("%Y/%m/%d"))+ggtitle("華航討論文章數") +theme(text = element_text(family='STHeitiTC-Light'))+xlab("日期")+ylab("數量")
```

### 得知: 
> (1)2020/12/15討論文章數最多，其次是2021/04/23<br>
> (2)2020/12/15 [新聞]華航公布777F新塗裝CARGO內藏台灣圖樣與china的文字，可能會與中國搞混，造成意識形態混亂的問題，引發熱烈討論。<br>
> (3)2020/04/23[新聞]確診的華航印尼籍機師到清真寺參加活動，當日活動參與人數逾400人<br>

## 3.資料清理
### 首先把推文和內容的資料合併
```{r}
##用於資料清理
Reviews2 = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#設定斷詞器
user<- scan(file = "./user_dict.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8',quiet = T)
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')

jieba_tokenizer = worker()
new_user_word(jieba_tokenizer, c(user))
# 設定斷詞function
gossip_tokenizer <- function(t) {
    lapply(t, function(x) {
        tokens <- segment(x, jieba_tokenizer)
        tokens <- tokens[!tokens %in% stop_words]
        return(tokens)
    })
}

```

### 把文章和留言的斷詞結果併在一起
```{r}
MToken <- MetaData %>% unnest_tokens(word, sentence, token = gossip_tokenizer)
RToken <- Reviews2 %>% unnest_tokens(word, cmtContent, token = gossip_tokenizer)
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 
```


### 格式化日期
```{r}
data$artDate <- as.Date(data$artDate)
data_select = data %>% 
    filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
    filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
    filter(nchar(.$word)>1)
```


### 查看篇數最多的那兩天裡面最常出現的詞彙
```{r}
gossip_tokens_by_date <- data_select %>% 
  count(artDate, word, sort = TRUE)
plot_merge <- gossip_tokens_by_date %>% 
  filter(artDate == as.Date("2020-12-15") | 
         artDate == as.Date("2021-04-23")) %>% 
  group_by(artDate) %>% 
  top_n(7, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x=word, y=n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales="free", ncol = 2) + 
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
plot_merge

```


## 4.文字雲
### 接下來我們來討論在這半年中，八卦版對華航的討論為何，算出全部字的詞頻後，畫出文字雲
```{r}
set.seed(100)
data_select %>%
    select(word) %>%
    group_by(word) %>%
    summarise(count=n()) %>%  # 算字詞單篇總數用summarise
    filter(word != "華航" & word != "台灣") %>%
    arrange(desc(count)) %>% 
    head(120)%>% wordcloud2()->ppp
```
![](ppp.png)

#### 一些出現頻率比較高的如 : 機師、中國、長榮、英國、確診、口罩、防疫......
+ 機師：4/23時去清真寺的消息
+ 中國：在新冠肺炎、華航改名的討論時，常會一起出現
+ 長榮：常與華航一起做比較，如：疫苗的施打率(長榮比華航多)
+ 英國：變種病毒的來源
+ 確診、口罩、來源：疫情期間常討論的話題

## 5.情緒分析
### 使用LIWC字典
```{r}
P = read_file("./liwc/positive.txt") # 正向字典txt檔
N = read_file("./liwc/negative.txt") # 負向字典txt檔
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
LIWC = rbind(P, N)
```

### 算出每天情緒總和(sentiment_count)，並畫出日期波動圖
```{r}
sentiment_count = data_select %>%
    select(artDate,word) %>%
    inner_join(LIWC) %>% 
    group_by(artDate,sentiment) %>%
    summarise(count=n()) %>% 
    arrange(desc(count))
#range(sentiment_count$artDate) #"2020-10-28" "2021-04-26"
#設定y軸刻度
datebreaks = seq(as.Date("2020-10-28"), as.Date("2021-04-26"),by = "1 month")
#沒有線的情緒分數
sentiment_count %>%
    ggplot()+
    geom_line(aes(x=artDate,y=count,colour=sentiment))+
    scale_x_date(breaks = datebreaks)+
    theme(axis.text.x = element_text(angle = 30, hjust = 1))+ggtitle("情緒分佈") +theme(text = element_text(family='STHeitiTC-Light'))+xlab("日期")+ylab("數量")

```

#### 由上圖觀察可得2020/12/15、2020/04/23，情緒波動比較大


### 畫出2020/12/15、2020/04/23日期線
```{r}
sentiment_count %>%
    ggplot()+
    geom_line(aes(x=artDate,y=count,colour=sentiment))+
    scale_x_date(breaks = datebreaks)+
    theme(axis.text.x = element_text(angle = 30, hjust = 1))+
    geom_vline(xintercept = c(as.numeric(as.Date("2021-04-23")),
                              as.numeric(as.Date("2020-12-14"))), col='blue', size = 1)
```

#### 在討論到華航時，幾乎都是負面情緒大於正面情緒

### 這兩日最常出現的正負面情緒字
```{r}
data_sentiment <- data_select %>%
  filter(artDate == as.Date("2020-12-15") | 
         artDate == as.Date("2021-04-23") ) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = 'STHeitiTC-Light'))+
  coord_flip()
data_sentiment
```

### 正負情緒文字雲
```{r}
par(family=("Heiti TC Light"))
data_sentiment$data %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # positive negative
                   max.words = 50)
```

### 分析2021/04/23前後五天的情緒
```{r}
sentiment_count %>%  filter(artDate<=as.Date("2021-04-25",format="%Y-%m-%d"))%>%
  # 標準化的部分
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-04-20','2021-04-25'))
               )+
  # 加上標示日期的線
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-04-23'))[1]])),colour = "black",linetype=4)
```
#### 可以看到4/23事件爆發後，由於指揮中心持續公布新的染疫者足跡地點，所以負面情緒越來越高漲，正面情緒比例一直下降，代表民眾真的很憤怒也很恐慌！

## 6.TF-IDF
### 計算每一篇文章的詞數
```{r}
gossip_word = data_select %>% count(artUrl, word, sort = TRUE)
total_words = gossip_word %>% 
    group_by(artUrl) %>% 
    summarize(total = sum(n)) %>% 
    arrange(desc(total))
total_words
```

### 合併需要的資料欄位，計算詞彙的 tf-idf 值
```{r}
gossip_word = left_join(gossip_word, total_words)
# 以每篇文章爲單位，計算每個詞彙的 tf-idf 值
gossip_words_tf_idf = gossip_word %>%
    bind_tf_idf(word, artUrl, n) %>% 
    group_by(artUrl) %>%
    slice_max(tf_idf, n=5) %>% 
    arrange(desc(artUrl))
gossip_words_tf_idf
```

### 計算整個文集中 tf-idf 值高的字
```{r}
gossip_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=10) %>%
  ungroup() %>%
  count(word, sort=TRUE)
```
+ 機師：染疫事件的主角
+ 英國：變種病毒
+ 禮拜五：染疫事件爆發4/23是星期五


### 計算兩個詞彙同時出現的總次數
移除PTT貼新聞時會出現的格式
```{r}
word_pairs <- gossip_word %>%
    filter(word!="新聞標題" & word!="來源" & word!="違者" & word!="刪除"& word!="署名"& word!="連結"& word!="請放"& word!="完整"& word!="內文"&word!="備註"& word!="媒體"& word!="記者")%>%
    pairwise_count(word, artUrl, sort = TRUE)
word_pairs

```

### 算出字詞的相關性
```{r}
word_cors <- gossip_word %>%
    filter(word!="新聞標題" & word!="來源" & word!="違者" & word!="刪除"& word!="署名"& word!="連結"&  word!="請放"& word!="完整"& word!="內文"&word!="備註"& word!="媒體"& word!="記者")%>% 
    group_by(word) %>%
    filter(n() >= 10) %>%
    pairwise_cor(word, artUrl, sort = TRUE)
word_cors
```

## 7.bigram
```{r}
Metadata2 <- MetaData %>% 
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% #換行、空格都用句號取代
  mutate(sentence=gsub("\n", "", sentence)) %>% 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
Metadata2
```

移除PTT貼新聞時會出現的格式用字
```{r}
Metadata2 = Metadata2 %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))
```

bigram function
```{r}

jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
```

執行bigram分詞
```{r}
data_bigram <- Metadata2 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
#data_bigram

data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")
```


## 8.Trigram
trigram function
```{r}
jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
```

執行trigram分詞
```{r}
data_trigram <- Metadata2 %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
#data_trigram

data_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
```


## 9.共線圖
畫出共線圖(correlation > 0.5)
```{r}
set.seed(666)
word_cors %>%
    filter(correlation > 0.5 ) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定，避免中文字顯示錯誤。
    theme_void()
```

畫出共線圖(correlation > 0.7)
```{r}
set.seed(666)
word_cors %>%
    filter(correlation > 0.7 ) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + #加入中文字型設定，避免中文字顯示錯誤。
    theme_void()
```

## 10.結論
> 大部分民眾對於華航議題都處於負面情緒，12月左右都是在討論華航改名、飛機新塗裝的議題。4/23機師染疫風波發生之後，其染疫足跡擴及更多人，造成民眾的恐慌和憤怒，負面情緒更為高漲。

## 11.小發現
>正常來說，當討論情緒都是負面的時候，一家公司的股價應該會下跌，但華航反而上漲，這是我們有趣的小發現：ptt反指標，嘻嘻`r "\U1F601"``r "\U1F60E"`


#### 2020/12/15華航股價：11.70TWD
![](1.png)<br>

#### 2021/04/23華航股價：20.40TWD`r "\U1F631"`
![](2.png)


