主題

分析PTT八卦版對太魯閣號出軌事件的文字資料

動機與分析目的

  • 背景與動機
    • 清明連假正是大家準備開心祭祖或是出遊的日子,大眾交通工具成為提供大家往返的工具,尤其是台鐵鐵路運輸的環島交通網,更是往來西岸與東岸最便利的方式,連假的第一天是交通最繁忙的一天,台鐵的太魯閣號卻因為施工單位的疏忽發生了嚴重的出軌意外,造成247輕重傷、49人死亡的悲劇;這場意外究竟是單純外包施工廠商,抑或是積習已久的台灣鐵路公司監督不周所造成的。
  • 研究目的
    1. 了解民眾對於太魯閣號事件的態度。
    2. 網民認為太魯閣號的責任歸屬。
    3. 太魯閣號出軌事件後該做的事。

資料集描述

  • 資料來源:中山大學管理學院文字分析平台收集PT T八卦版文章取得之原始csv檔案。
  • 資料集:PPT八卦版。
  • 資料日期區間:2021.03.31~2021.04.14。
  • 資料的關鍵字:檢索「台鐵」、「李義祥」、「太魯閣」、「工程車」、「義祥」、「出軌」六個關鍵字,共搜尋出1485篇文章。

ch 01前置作業

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

安裝需要的packages

packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

將require及library載入

require(data.table)
require(dplyr)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)

library(data.table)
require(wordcloud2)
require(lubridate)
require(htmlwidgets)
require(webshot)
require(plotly)
require( RColorBrewer)

require(servr)
require(tm)
require(data.table)
require(stringr)
library(topicmodels)
require(LDAvis)
require(webshot)

載入自平台下載下來的資料

rd <- read_csv("太魯閣事件_artWordFreq.csv")
rd2 <- read_csv("太魯閣事件_articleMetaData.csv")

rd$artDate <- rd$artDate %>% as.Date("%Y/%m/%d")
rd

ch 02 折線圖

資料處理

data <- rd %>% 
  dplyr::select(artDate, artUrl) %>% 
  distinct()
article_count_by_date <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())
head(article_count_by_date, 20)

太魯閣事件在PTT八卦版3/31~4/14聲量折線圖

plot_date <- 
  # data
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("PTT 八卦版 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme() #加入中文字型設定,避免中文字顯示錯誤。

plot_date

ch 03 文字雲

資料處理

data <- rd %>% 
  group_by(word) %>% 
  summarise(sum = sum(count), .groups = 'drop') %>% 
  arrange(desc(sum))
head(data)

太魯閣事件在PTT八卦版3/31~4/14文字雲

data %>% filter(sum > 50) %>% wordcloud2()

ch 04 常出現的詞彙

資料處理

rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
rd_tokens_by_date <- rd_tokens %>% 
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)

太魯閣事件在PTT八卦版4/2~4/10每天最常見的10個詞彙

stop_words <- scan(file = "dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
plot_merge <- rd_tokens_by_date %>% 
  filter(!(word %in% stop_words) & !(word %in% "台鐵")) %>%
  filter(artDate == as.Date("2021-04-02") | 
         artDate == as.Date("2021-04-03") | 
         artDate == as.Date("2021-04-04") |
         artDate == as.Date("2021-04-05") | 
         artDate == as.Date("2021-04-06") | 
         artDate == as.Date("2021-04-07") | 
         artDate == as.Date("2021-04-08") | 
         artDate == as.Date("2021-04-09") | 
         artDate == as.Date("2021-04-10")) %>% 
  group_by(artDate) %>% 
  top_n(10, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x=word, y=n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales="free", ncol = 5) + 
  coord_flip()+
  theme(text = element_text())
plot_merge

ch 05 情緒分析

載入情緒分析字典

# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/positive.txt")

# 負向字典txt檔
N <- read_file("dict/negative.txt")
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
rd_tokens_by_date %>%
  inner_join(LIWC) %>%
  select(word) %>%
  inner_join(LIWC) 
sentiment_count = rd_tokens_by_date %>%
  select(artDate,word,n) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(n))

初始化斷詞引擎

# 加入自定義的字典
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

# 設定斷詞function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

太魯閣事件在PTT八卦版3/31~4/14正負面情緒聲量折線圖


sentiment_count %>%
  ggplot() +
  geom_line(aes(x=artDate,y=count,colour=sentiment)) +
  labs(x=NULL,y="數量")

  scale_x_date(labels = date_format("%m/%d"))  
rd_tokens_all <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>% 
  select(-artTime)

太魯閣事件在PTT八卦版4/2~4/8正負情緒長條圖

rd_tokens_all %>%
  filter(artDate == as.Date("2021-04-02") |
         artDate == as.Date("2021-04-03") | 
         artDate == as.Date("2021-04-04") | 
         artDate == as.Date("2021-04-05") |
         artDate == as.Date("2021-04-06") |
         artDate == as.Date("2021-04-07") |
         artDate == as.Date("2021-04-08") ) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x= "文字", y="數量") +
  facet_wrap(~sentiment, scales = "free_y") +
  theme(text=element_text(size=14))+
  coord_flip()
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
head(g_ngrams_11_separated)
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "太魯閣"))
g_check_words

太魯閣事件在PTT八卦版出現在「太魯閣」附近的字

g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「太魯閣」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "台鐵"))
g_check_words

太魯閣事件在PTT八卦版出現在「台鐵」附近的字

g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「台鐵」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())
# 進行斷詞,並計算各詞彙在各文章中出現的次數
rd_words <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
head(rd_words)
# 計算每篇文章包含的詞數
total_words <- rd_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
head(total_words)
# 合併 mask_words(每個詞彙在每個文章中出現的次數)
# 與 total_words(每篇文章的詞數)
# 新增各個詞彙在所有詞彙中的總數欄位
rd_words <- left_join(rd_words, total_words)
head(rd_words)
# 以每篇文章爲單位,計算每個詞彙在的tf-idf值
rd_words_tf_idf <- rd_words %>%
  bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf
# 選出每篇文章,tf-idf最大的十個詞
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))
# 選每篇文章,tf-idf最大的十個詞,
# 並查看每個詞被選中的次數
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)
# 計算兩個詞彙間的相關性
stop_words <- scan(file = "dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
word_cors <- rd_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  filter(!(word %in% stop_words)&&!(word %in% "記者")) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

#word_cors
# 與太魯閣相關性高的詞彙
word_cors %>%
  filter(item1 == "太魯閣") %>% 
  head(5)
# 與義祥相關性高的詞彙
word_cors %>%
  filter(item1 == "義祥") %>% 
  head(5)
# 與台鐵相關性高的詞彙
word_cors %>%
  filter(item1 == "台鐵") %>% 
  head(5)

太魯閣事件在PTT八卦版出現在「 “台鐵”, “太魯閣”, “政府”, “義祥” 」相關性最高的 10 個詞彙

# 分別尋找與 "台鐵", "太魯閣", "政府", "義祥" 相關性最高的 10 個詞彙
word_cors %>%
  filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
  group_by(item1) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  labs(x=NULL,y=NULL) + 
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text())

ch 06 共線相關圖

太魯閣事件在PTT八卦版共線相關圖

# 顯示相關性大於0.4的組合
set.seed(2020)
word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + #加入中文字型設定,避免中文字顯示錯誤。
  theme_void()
# 顯示相關性大於0.5的組合
set.seed(2020)

word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

結論

在整個事件中,大眾認為太魯閣號的責任應歸咎於台鐵和政府,並希望在太魯閣事件後台鐵能效法高鐵將公司民營化或公司化,避免未來再有憾事發生。

---
title: "分析PTT八卦版對太魯閣號出軌事件的文字資料"
author: "N094020017 林祐新, N094020034 蔡芸平, N094020032 鄒岱佑, N094020001 宋明毅"
output: 
  html_notebook:
    toc: yes
    toc_float: yes
    highlight: pygments
    theme: flatly
    css: style.css
  html_document:
    toc: yes
    df_print: paged
---
## 主題
> 分析PTT八卦版對太魯閣號出軌事件的文字資料

# 動機與分析目的
- 背景與動機
  + 清明連假正是大家準備開心祭祖或是出遊的日子，大眾交通工具成為提供大家往返的工具，尤其是台鐵鐵路運輸的環島交通網，更是往來西岸與東岸最便利的方式，連假的第一天是交通最繁忙的一天，台鐵的太魯閣號卻因為施工單位的疏忽發生了嚴重的出軌意外，造成247輕重傷、49人死亡的悲劇；這場意外究竟是單純外包施工廠商，抑或是積習已久的台灣鐵路公司監督不周所造成的。

+ 研究目的
   1. 了解民眾對於太魯閣號事件的態度。</br>
   2. 網民認為太魯閣號的責任歸屬。</br>
   3. 太魯閣號出軌事件後該做的事。</br>

## 資料集描述
+ 資料來源：中山大學管理學院文字分析平台收集PT T八卦版文章取得之原始csv檔案。
+ 資料集：PPT八卦版。
+ 資料日期區間：2021.03.31~2021.04.14。
+ 資料的關鍵字:檢索「台鐵」、「李義祥」、「太魯閣」、「工程車」、「義祥」、「出軌」六個關鍵字，共搜尋出1485篇文章。

# ch 01前置作業
```{r}
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
```

## 安裝需要的packages
```{r}
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```

## 將require及library載入
```{r}
require(data.table)
require(dplyr)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)

library(data.table)
require(wordcloud2)
require(lubridate)
require(htmlwidgets)
require(webshot)
require(plotly)
require( RColorBrewer)

require(servr)
require(tm)
require(data.table)
require(stringr)
library(topicmodels)
require(LDAvis)
require(webshot)
```

## 載入自平台下載下來的資料
```{r}
rd <- read_csv("太魯閣事件_artWordFreq.csv")
rd2 <- read_csv("太魯閣事件_articleMetaData.csv")

rd$artDate <- rd$artDate %>% as.Date("%Y/%m/%d")
rd
```


# ch 02 折線圖
## 資料處理
```{r}
data <- rd %>% 
  dplyr::select(artDate, artUrl) %>% 
  distinct()
```

```{r}
article_count_by_date <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())
```

```{r}
head(article_count_by_date, 20)
```


## 太魯閣事件在PTT八卦版3/31~4/14聲量折線圖
```{r}
plot_date <- 
  # data
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("PTT 八卦版 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme() #加入中文字型設定，避免中文字顯示錯誤。

plot_date
```

# ch 03 文字雲
## 資料處理
```{r}
data <- rd %>% 
  group_by(word) %>% 
  summarise(sum = sum(count), .groups = 'drop') %>% 
  arrange(desc(sum))
```

```{r}
head(data)
```

## 太魯閣事件在PTT八卦版3/31~4/14文字雲
```{r}
data %>% filter(sum > 50) %>% wordcloud2()
```


# ch 04 常出現的詞彙
## 資料處理
```{r}
rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
```


```{r}
rd_tokens_by_date <- rd_tokens %>% 
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)
```

## 太魯閣事件在PTT八卦版4/2~4/10每天最常見的10個詞彙
```{r}
stop_words <- scan(file = "dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
plot_merge <- rd_tokens_by_date %>% 
  filter(!(word %in% stop_words) & !(word %in% "台鐵")) %>%
  filter(artDate == as.Date("2021-04-02") | 
         artDate == as.Date("2021-04-03") | 
         artDate == as.Date("2021-04-04") |
         artDate == as.Date("2021-04-05") | 
         artDate == as.Date("2021-04-06") | 
         artDate == as.Date("2021-04-07") | 
         artDate == as.Date("2021-04-08") | 
         artDate == as.Date("2021-04-09") | 
         artDate == as.Date("2021-04-10")) %>% 
  group_by(artDate) %>% 
  top_n(10, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x=word, y=n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales="free", ncol = 5) + 
  coord_flip()+
  theme(text = element_text())
plot_merge
```


# ch 05 情緒分析
## 載入情緒分析字典
```{r}
# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/positive.txt")

# 負向字典txt檔
N <- read_file("dict/negative.txt")
```

```{r}
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
```

```{r}
LIWC = rbind(P, N)
```

```{r}
rd_tokens_by_date %>%
  inner_join(LIWC) %>%
  select(word) %>%
  inner_join(LIWC) 
```

```{r}
sentiment_count = rd_tokens_by_date %>%
  select(artDate,word,n) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(n))
```

## 初始化斷詞引擎
```{r}
# 加入自定義的字典
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

# 設定斷詞function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
```

## 太魯閣事件在PTT八卦版3/31~4/14正負面情緒聲量折線圖
```{r}

sentiment_count %>%
  ggplot() +
  geom_line(aes(x=artDate,y=count,colour=sentiment)) +
  labs(x=NULL,y="數量")

  scale_x_date(labels = date_format("%m/%d"))  
```
```{r}
rd_tokens_all <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>% 
  select(-artTime)
```

## 太魯閣事件在PTT八卦版4/2~4/8正負情緒長條圖
```{r}
rd_tokens_all %>%
  filter(artDate == as.Date("2021-04-02") |
         artDate == as.Date("2021-04-03") | 
         artDate == as.Date("2021-04-04") | 
         artDate == as.Date("2021-04-05") |
         artDate == as.Date("2021-04-06") |
         artDate == as.Date("2021-04-07") |
         artDate == as.Date("2021-04-08") ) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x= "文字", y="數量") +
  facet_wrap(~sentiment, scales = "free_y") +
  theme(text=element_text(size=14))+
  coord_flip()
```




```{r}
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
head(g_ngrams_11_separated)
```

```{r}
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "太魯閣"))
g_check_words
```

## 太魯閣事件在PTT八卦版出現在「太魯閣」附近的字
```{r}
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「太魯閣」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())
```

```{r}
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
```

```{r}
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "台鐵"))
g_check_words
```

## 太魯閣事件在PTT八卦版出現在「台鐵」附近的字
```{r}
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「台鐵」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())
```

```{r}
# 進行斷詞，並計算各詞彙在各文章中出現的次數
rd_words <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
head(rd_words)
```

```{r}
# 計算每篇文章包含的詞數
total_words <- rd_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
head(total_words)
```

```{r}
# 合併 mask_words（每個詞彙在每個文章中出現的次數）
# 與 total_words（每篇文章的詞數）
# 新增各個詞彙在所有詞彙中的總數欄位
rd_words <- left_join(rd_words, total_words)
```

```{r}
head(rd_words)
```

```{r}
# 以每篇文章爲單位，計算每個詞彙在的tf-idf值
rd_words_tf_idf <- rd_words %>%
  bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf
```

```{r}
# 選出每篇文章，tf-idf最大的十個詞
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))
```

```{r}
# 選每篇文章，tf-idf最大的十個詞，
# 並查看每個詞被選中的次數
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)
```

```{r}
# 計算兩個詞彙間的相關性
stop_words <- scan(file = "dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
word_cors <- rd_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  filter(!(word %in% stop_words)&&!(word %in% "記者")) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

#word_cors
```

```{r}
# 與太魯閣相關性高的詞彙
word_cors %>%
  filter(item1 == "太魯閣") %>% 
  head(5)
```

```{r}
# 與義祥相關性高的詞彙
word_cors %>%
  filter(item1 == "義祥") %>% 
  head(5)
```

```{r}
# 與台鐵相關性高的詞彙
word_cors %>%
  filter(item1 == "台鐵") %>% 
  head(5)
```

## 太魯閣事件在PTT八卦版出現在「 "台鐵", "太魯閣", "政府", "義祥" 」相關性最高的 10 個詞彙
```{r}
# 分別尋找與 "台鐵", "太魯閣", "政府", "義祥" 相關性最高的 10 個詞彙
word_cors %>%
  filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
  group_by(item1) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  labs(x=NULL,y=NULL) + 
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text())
```



# ch 06 共線相關圖
## 太魯閣事件在PTT八卦版共線相關圖
```{r}
# 顯示相關性大於0.4的組合
set.seed(2020)
word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + #加入中文字型設定，避免中文字顯示錯誤。
  theme_void()
```

```{r}
# 顯示相關性大於0.5的組合
set.seed(2020)

word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
```


# 結論
>在整個事件中，大眾認為太魯閣號的責任應歸咎於台鐵和政府，並希望在太魯閣事件後台鐵能效法高鐵將公司民營化或公司化，避免未來再有憾事發生。

