Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # set the locale so Chinese text is not garbled
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("readr", "dplyr","wordcloud2", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph","reshape2", "NLP","scales", "reshape2", "widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(readr)
## Loading required package: readr
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(NLP)
## Loading required package: NLP
require(tidyr)
## Loading required package: tidyr
require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
require(ggraph)
## Loading required package: ggraph
require(igraph)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
require(scales)
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
require(widyr)
## Loading required package: widyr
require(knitr)
## Loading required package: knitr
require(wordcloud2)
## Loading required package: wordcloud2
require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:igraph':
## 
##     %--%, union
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
require(htmlwidgets)
## Loading required package: htmlwidgets
require(webshot)
## Loading required package: webshot
require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
require( RColorBrewer)
## Loading required package: RColorBrewer
require(servr)
## Loading required package: servr
require(tm)
## Loading required package: tm
library(topicmodels)
require(LDAvis)
## Loading required package: LDAvis
rd <- read_csv("太魯閣事件_artWordFreq.csv") 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   word = col_character(),
##   count = col_double()
## )
rd2 <- read_csv("太魯閣事件_articleMetaData.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
# artDate was already parsed as a Date by read_csv (see the column spec above); coerce defensively
rd$artDate <- as.Date(rd$artDate)
rd
## # A tibble: 100,230 x 6
##    artTitle        artDate    artTime  artUrl                        word  count
##    <chr>           <date>     <time>   <chr>                         <chr> <dbl>
##  1 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 沒開      3
##  2 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 今年      1
##  3 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 首波      1
##  4 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 玩笑      1
##  5 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 竟然      1
##  6 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 台鐵整~     1
##  7 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 設好      1
##  8 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 鬧鐘      1
##  9 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 車票      1
## 10 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 日期      1
## # ... with 100,220 more rows
jieba_tokenizer = worker()

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens that are only one character long
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
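# A minimal check on a made-up sentence (hypothetical input, not from the
# corpus); inputs of length <= 1 fall through and return NULL:
# chi_tokenizer(c("台鐵太魯閣號發生出軌事故"))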
data <- rd %>% 
  dplyr::select(artDate, artUrl) %>% 
  distinct()
article_count_by_date <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 15 x 2
##    artDate    count
##    <date>     <int>
##  1 2021-03-31     1
##  2 2021-04-01     1
##  3 2021-04-02   354
##  4 2021-04-03   205
##  5 2021-04-04   215
##  6 2021-04-05   121
##  7 2021-04-06   134
##  8 2021-04-07   200
##  9 2021-04-08    79
## 10 2021-04-09    80
## 11 2021-04-10    35
## 12 2021-04-11    17
## 13 2021-04-12    18
## 14 2021-04-13    20
## 15 2021-04-14     5
plot_date <- 
  # data
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("PTT 八卦版 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme() # set a Chinese-capable font here so CJK labels render correctly

plot_date

data <- rd %>% 
  group_by(word) %>% 
  summarise(sum = sum(count), .groups = 'drop') %>% 
  arrange(desc(sum))
head(data)
## # A tibble: 6 x 2
##   word     sum
##   <chr>  <dbl>
## 1 台鐵    2600
## 2 事故     792
## 3 問題     759
## 4 工程     715
## 5 工程車   715
## 6 太魯閣   713
data %>% filter(sum > 50) %>% wordcloud2()
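# Optional: wordcloud2 returns an htmlwidget. A hedged sketch for keeping a
# static copy, using the htmlwidgets/webshot packages loaded above (the file
# names are placeholders):
# wc <- data %>% filter(sum > 50) %>% wordcloud2()
# saveWidget(wc, "wordcloud.html", selfcontained = FALSE)
# webshot("wordcloud.html", "wordcloud.png", delay = 5)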
rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
## # A tibble: 6 x 4
##   artTitle               artDate    word   count
##   <chr>                  <date>     <chr>  <dbl>
## 1 [問卦]台鐵是在愚人節? 2021-03-31 沒開       3
## 2 [問卦]台鐵是在愚人節? 2021-03-31 今年       1
## 3 [問卦]台鐵是在愚人節? 2021-03-31 首波       1
## 4 [問卦]台鐵是在愚人節? 2021-03-31 玩笑       1
## 5 [問卦]台鐵是在愚人節? 2021-03-31 竟然       1
## 6 [問卦]台鐵是在愚人節? 2021-03-31 台鐵整     1
rd_tokens_by_date <- rd_tokens %>% 
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)
plot_merge <- rd_tokens_by_date %>% 
  # keep the peak discussion window, 2021-04-02 through 2021-04-10
  filter(between(artDate, as.Date("2021-04-02"), as.Date("2021-04-10"))) %>% 
  group_by(artDate) %>% 
  top_n(7, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x=word, y=n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales="free", ncol = 5) + 
  coord_flip()+
  theme(text = element_text())
plot_merge

# positive lexicon (txt file, comma-separated)
P <- read_file("dict/positive.txt")

# negative lexicon (txt file, comma-separated)
N <- read_file("dict/negative.txt")
# split the string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build data frames with two columns, word and sentiment;
# the word column holds the lexicon vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
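# Quick sanity check on the merged lexicon (counts depend on the dictionary files):
# head(LIWC)
# table(LIWC$sentiment)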
# words in rd_tokens_by_date that appear in the sentiment lexicon
rd_tokens_by_date %>%
  inner_join(LIWC, by = "word") %>%
  select(word, sentiment)
## # A tibble: 188 x 2
##    word  sentiment
##    <chr> <chr>    
##  1 事故  negative 
##  2 問題  negative 
##  3 問題  negative 
##  4 事故  negative 
##  5 事故  negative 
##  6 問題  negative 
##  7 安全  positive 
##  8 八卦  negative 
##  9 事故  negative 
## 10 嚴重  negative 
## # ... with 178 more rows
sentiment_count = rd_tokens_by_date %>%
  select(artDate,word,n) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
sentiment_count %>%
  ggplot() +
  geom_line(aes(x=artDate,y=count,colour=sentiment)) +

  scale_x_date(labels = date_format("%m/%d"))  

rd_tokens_all <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>% 
  select(-artTime)
rd_tokens_all %>% 
  filter(between(artDate, as.Date("2021-04-02"), as.Date("2021-04-08"))) %>% 
  inner_join(LIWC) %>% 
  filter(sentiment == "positive") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
## # A tibble: 897 x 4
## # Groups:   artUrl [897]
##    artUrl                            sentiment artTitle                    count
##    <chr>                             <chr>     <chr>                       <int>
##  1 https://www.ptt.cc/bbs/Gossiping~ positive  [新聞]台鐵體檢報告曝光,近7年出軌事件「零改~    86
##  2 https://www.ptt.cc/bbs/Gossiping~ positive  Re:[問卦]為什麼台鐵要民營化啊?~    85
##  3 https://www.ptt.cc/bbs/Gossiping~ positive  Re:[新聞]抓到了!李義祥今年2月早被判關半年 高~    75
##  4 https://www.ptt.cc/bbs/Gossiping~ positive  [新聞]【後續調查】台鐵工地與安全管理長期鬆~    64
##  5 https://www.ptt.cc/bbs/Gossiping~ positive  [新聞]防災專家:台鐵要轉骨須府院出手~    31
##  6 https://www.ptt.cc/bbs/Gossiping~ positive  [新聞]台鐵出軌》蔡英文:改革台鐵勢在必行,~    30
##  7 https://www.ptt.cc/bbs/Gossiping~ positive  Re:[問卦]台鐵不趕快民營化是在幹嘛???????~    26
##  8 https://www.ptt.cc/bbs/Gossiping~ positive  [問卦]為什麼台鐵要民營化啊?~    25
##  9 https://www.ptt.cc/bbs/Gossiping~ positive  Re:[問卦]明明就工程車駕駛的鍋在無限上綱什麼?~    24
## 10 https://www.ptt.cc/bbs/Gossiping~ positive  [新聞]台鐵總體檢報告卡「賴下蘇上」遲未核定 ~    23
## # ... with 887 more rows
rd_tokens_all %>% 
  filter(between(artDate, as.Date("2021-04-02"), as.Date("2021-04-08"))) %>% 
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
## # A tibble: 998 x 4
## # Groups:   artUrl [998]
##    artUrl                            sentiment artTitle                    count
##    <chr>                             <chr>     <chr>                       <int>
##  1 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]台鐵體檢報告曝光,近7年出軌事件「零改~    99
##  2 https://www.ptt.cc/bbs/Gossiping~ negative  Re:[新聞]抓到了!李義祥今年2月早被判關半年 高~    92
##  3 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]【後續調查】台鐵工地與安全管理長期鬆~    70
##  4 https://www.ptt.cc/bbs/Gossiping~ negative  Re:[問卦]為什麼台鐵要民營化啊?~    63
##  5 https://www.ptt.cc/bbs/Gossiping~ negative  [問卦]為什麼台鐵要民營化啊?~    53
##  6 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]防災專家:台鐵要轉骨須府院出手~    37
##  7 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]【台鐵出軌】藍營要求成立事故調閱委員會~    31
##  8 https://www.ptt.cc/bbs/Gossiping~ negative  Re:[新聞]台鐵出軌日媒:暴露台灣基礎設施缺陷~    29
##  9 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]台鐵出軌》蔡英文:改革台鐵勢在必行,~    29
## 10 https://www.ptt.cc/bbs/Gossiping~ negative  [新聞]台鐵出軌》綠委喊話:林佳龍有種留下把~    26
## # ... with 988 more rows
rd_tokens_all %>%
  filter(between(artDate, as.Date("2021-04-02"), as.Date("2021-04-08"))) %>% 
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
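# ngram_11 slides an 11-token window over each sentence, so sentences that
# segment into fewer than 11 tokens yield no n-grams. A toy check
# (hypothetical input):
# ngram_11("連假第一天台鐵驚傳太魯閣號在花蓮大清水隧道出軌")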
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", 1:11), sep = " ")
head(g_ngrams_11_separated)
## # A tibble: 6 x 12
##   artUrl     word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
##   <chr>      <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> 
## 1 https://w~ 今年  首波  玩笑  竟然  是    台鐵整~ 設好  鬧鐘  等    著     搶    
## 2 https://w~ 獨家  最美  區間車~ 啟航  小    英    來    開箱  台    鐵     金牌  
## 3 https://w~ 最美  區間車~ 啟航  小    英    來    開箱  台    鐵    金牌   運轉  
## 4 https://w~ 區間車~ 啟航  小    英    來    開箱  台    鐵    金牌  運轉   手    
## 5 https://w~ 啟航  小    英    來    開箱  台    鐵    金牌  運轉  手     護送  
## 6 https://w~ 小    英    來    開箱  台    鐵    金牌  運轉  手    護送    
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "太魯閣"))
g_check_words
## # A tibble: 305 x 12
##    artUrl    word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
##    <chr>     <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> 
##  1 https://~ 春節  疏運  他    駕駛  的    太魯閣~ 號    遇上  電車  線     故障  
##  2 https://~ 連假  第一天~ 台鐵今~ 上午  驚傳  太魯閣~ 號在  花蓮  大    清水   隧道  
##  3 https://~ 連假台~ 鐵    工程  全    暫停  太魯閣~ 號卻  撞    上    工程車 出軌  
##  4 https://~ 出動  調    查    台鐵局~ 表示  太魯閣~ 號是  撞    上台  鐵局   的    
##  5 https://~ 這麼  急好  ㄇ    台鐵局~ 表示  太魯閣~ 號是  撞    上台  鐵局   的    
##  6 https://~ 鐵    局局長~ 請辭  這次  花蓮  太魯閣~ 出軌  會    由誰來~ 負責   司機  
##  7 https://~ 火車撞~ 上    而且  那    班次  太魯閣~ 應該  不是  第一輛~ 通過   該處  
##  8 https://~ 是從  樹林  前往  台東  的    太魯閣~ 號    全車  搭車  旅客   約    
##  9 https://~ 緊急  成立  一級  應變  小組  太魯閣~ 撞    工程車~ 出軌  圖     記者  
## 10 https://~ 彥傑  翻攝  原文  網址  快訊  太魯閣~ 出軌  車頭  照片  曝光        
## # ... with 295 more rows
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  # stop_words must already be loaded (see the "load stop words" step below)
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「太魯閣」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())
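# The same reshape with tidyr instead of reshape2::melt (a sketch; the counts
# should match the plot above):
# g_check_words %>%
#   pivot_longer(paste0("word", 1:11), names_to = "variable", values_to = "word") %>%
#   filter(variable != "word6", !(word %in% stop_words), nchar(word) > 1) %>%
#   count(word, sort = TRUE)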

# ngram_11 and g_ngrams_11_separated are reused from above unchanged;
# this time we inspect the words surrounding 「台鐵」.
g_ngrams_11_separated
## # A tibble: 170,303 x 12
##    artUrl    word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
##    <chr>     <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> 
##  1 https://~ 今年  首波  玩笑  竟然  是    台鐵整~ 設好  鬧鐘  等    著     搶    
##  2 https://~ 獨家  最美  區間車~ 啟航  小    英    來    開箱  台    鐵     金牌  
##  3 https://~ 最美  區間車~ 啟航  小    英    來    開箱  台    鐵    金牌   運轉  
##  4 https://~ 區間車~ 啟航  小    英    來    開箱  台    鐵    金牌  運轉   手    
##  5 https://~ 啟航  小    英    來    開箱  台    鐵    金牌  運轉  手     護送  
##  6 https://~ 小    英    來    開箱  台    鐵    金牌  運轉  手    護送        
##  7 https://~ 英    來    開箱  台    鐵    金牌  運轉  手    護送        親    
##  8 https://~ 來    開箱  台    鐵    金牌  運轉  手    護送       親     曝    
##  9 https://~ 開箱  台    鐵    金牌  運轉  手    護送       親    曝     新車  
## 10 https://~ 台    鐵    金牌  運轉  手    護送       親    曝    新車   這裡  
## # ... with 170,293 more rows
g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "台鐵"))
g_check_words
## # A tibble: 1,661 x 12
##    artUrl    word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
##    <chr>     <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr> 
##  1 https://~ 到    讓    他    能    脫穎而出~ 台鐵  內部  對    他    評價   則是  
##  2 https://~ 大學  畢業  後    就    鎖定  台鐵  報考  特考  期間  先到   日商  
##  3 https://~ 分鐘  後    再    啟程  而    台鐵  規定  對號  列車  延誤   逾    
##  4 https://~ 全額  退    費    為    讓    台鐵  不    虧    錢    他     在    
##  5 https://~ 了    那麼  多    努力  改革  台鐵  讓    台    鐵    煥然一新~ 你    
##  6 https://~ 如果  今天  部長  不是  林佳龍~ 台鐵  恐怕  無法  避免  發生   更    
##  7 https://~ 工程車~ 翻覆  在    邊坡  旁    台鐵  表示  目前  路線  暫以   單線  
##  8 https://~ 就    發生  這種  事故  到底  台鐵  怎麼  了    以後  誰     敢    
##  9 https://~ 跡象  記者  李宜秦~ 台北  報導  台鐵  驚傳  出軌  事故  交通部 台鐵局
## 10 https://~ 台    鐵    事故  不算  少    台鐵  工會  也    常    抱怨   既然  
## # ... with 1,651 more rows
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「台鐵」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text())

# tokenize the articles and count each word's occurrences per article
rd_words <- rd2 %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
head(rd_words)
## # A tibble: 6 x 3
##   artUrl                                                   word      n
##   <chr>                                                    <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商     95
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購     55
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵     44
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢     39
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車     38
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道     36
# count the total number of tokens in each article
total_words <- rd_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
head(total_words)
## # A tibble: 6 x 2
##   artUrl                                                   total
##   <chr>                                                    <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617207223.A.020.html    16
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617249914.A.8DA.html   465
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617329020.A.9C1.html    18
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617329416.A.0EA.html    29
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617330379.A.DDD.html    22
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617331302.A.E6D.html   189
# join rd_words (each word's count per article)
# with total_words (token count per article)
# to add a column with each article's total token count
rd_words <- left_join(rd_words, total_words)
## Joining, by = "artUrl"
head(rd_words)
## # A tibble: 6 x 4
##   artUrl                                                   word      n total
##   <chr>                                                    <chr> <int> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商     95  2912
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購     55  2912
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵     44  2773
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢     39  2773
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車     38  2773
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道     36  2773
# compute each word's tf-idf, treating each article as a document
rd_words_tf_idf <- rd_words %>%
  bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf
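# bind_tf_idf uses tf = n / total and idf = ln(n_documents / n_documents_containing_word).
# A hedged manual re-computation for cross-checking (values should match up to
# floating point):
# n_docs <- n_distinct(rd_words$artUrl)
# rd_words %>%
#   group_by(word) %>%
#   mutate(docs_with_word = n_distinct(artUrl)) %>%
#   ungroup() %>%
#   mutate(tf = n / total, idf = log(n_docs / docs_with_word), tf_idf = tf * idf)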
# pick the ten words with the highest tf-idf in each article
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))
## Selecting by tf_idf
## # A tibble: 16,458 x 7
## # Groups:   artUrl [1,485]
##    artUrl                                 word       n total     tf   idf tf_idf
##    <chr>                                  <chr>  <int> <int>  <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.16~ 李義祥     3    35 0.0857  1.84  0.158
##  2 https://www.ptt.cc/bbs/Gossiping/M.16~ 醜化       2    35 0.0571  7.30  0.417
##  3 https://www.ptt.cc/bbs/Gossiping/M.16~ 人格       1    35 0.0286  6.20  0.177
##  4 https://www.ptt.cc/bbs/Gossiping/M.16~ 利害關係~     1    35 0.0286  6.61  0.189
##  5 https://www.ptt.cc/bbs/Gossiping/M.16~ 害人       1    35 0.0286  6.61  0.189
##  6 https://www.ptt.cc/bbs/Gossiping/M.16~ 追殺       1    35 0.0286  6.61  0.189
##  7 https://www.ptt.cc/bbs/Gossiping/M.16~ 揣測       1    35 0.0286  6.20  0.177
##  8 https://www.ptt.cc/bbs/Gossiping/M.16~ 無數       1    35 0.0286  5.69  0.163
##  9 https://www.ptt.cc/bbs/Gossiping/M.16~ 毀滅       1    35 0.0286  6.61  0.189
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 網軍的     1    35 0.0286  6.20  0.177
## # ... with 16,448 more rows
# pick each article's ten highest tf-idf words,
# then count how many articles each word is selected in
rd_words_tf_idf %>% 
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 10,782 x 2
##    word       n
##    <chr>  <int>
##  1 李義祥    46
##  2 民營化    43
##  3 改革      40
##  4 公司化    33
##  5 民進黨    31
##  6 高鐵      27
##  7 工程      26
##  8 工地      23
##  9 工程車    23
## 10 捐款      22
## # ... with 10,772 more rows
# tokenize with jieba, then build n-grams with the ngrams() function from the NLP package
# e.g.
tokens <- segment("明天記得吃飯", jieba_tokenizer)
tokens
## [1] "明天" "記得" "吃飯"
bigram <- ngrams(tokens, 2)
bigram
## [[1]]
## [1] "明天" "記得"
## 
## [[2]]
## [1] "記得" "吃飯"
# Combine each bigram into a single string, using " " as the separator.
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
## [1] "明天 記得" "記得 吃飯"
# rebuild the jieba tokenizer
jieba_tokenizer = worker()

# bigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

jieba_bigram(c("明天記得吃飯", "早上準時起來"))
## [[1]]
## [1] "明天 記得" "記得 吃飯"
## 
## [[2]]
## [1] "早上 準時" "準時 起來"
# run the bigram tokenization
rd_bigram <- rd2 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
#rd_bigram
# drop bigrams containing letters or digits,
# then count how often each bigram occurs
rd_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE) %>% head
## # A tibble: 6 x 2
##   bigram        n
##   <chr>     <int>
## 1 台 鐵       784
## 2 太魯閣 號   425
## 3 完整 新聞   304
## 4 都 是       281
## 5 也 是       235
## 6 的 人       221
# trigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}

jieba_trigram(c("明天記得吃飯", "早上準時起來"))
## [[1]]
## [1] "明天 記得 吃飯"
## 
## [[2]]
## [1] "早上 準時 起來"
# run the trigram tokenization
rd_trigram <- rd2 %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
rd_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE) %>% head
## # A tibble: 6 x 2
##   ngrams             n
##   <chr>          <int>
## 1 完整 新聞 內文   152
## 2 完整 新聞 連結   152
## 3 或 短 網址       150
## 4 連結 或 短       150
## 5 新聞 連結 或     150
## 6 台鐵 太魯閣 號   115
# load stop words
stop_words <- scan(file = "dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "dict/stop_words.txt", what = character(), sep = "\n", :
## 輸入連結 'dict/stop_words.txt' 中的輸入不正確
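# The warning above usually means a BOM or stray control character in the
# file; readr::read_lines strips BOMs and may avoid it (a hedged alternative):
# stop_words <- read_lines("dict/stop_words.txt")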
# remove the stop words in bigram
rd_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")
## # A tibble: 87,018 x 2
##    bigram            n
##    <chr>         <int>
##  1 台 鐵           784
##  2 太魯閣 號       425
##  3 完整 新聞       304
##  4 蔡 英文         170
##  5 記者 署名       158
##  6 媒體 來源       156
##  7 完整 新聞標題   154
##  8 新聞 連結       154
##  9 新聞 內文       152
## 10 短 網址         151
## # ... with 87,008 more rows
# remove the stop words in trigram
rd_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
## # A tibble: 78,185 x 2
##    ngrams                 n
##    <chr>              <int>
##  1 完整 新聞 內文       152
##  2 完整 新聞 連結       152
##  3 台鐵 太魯閣 號       115
##  4 太魯閣 號 事故        75
##  5 太魯閣 號 出軌        66
##  6 總體 檢 報告          59
##  7 新聞 內文 台鐵        56
##  8 李 義 祥              53
##  9 完整 新聞標題 台鐵    44
## 10 次 太魯閣 號          43
## # ... with 78,175 more rows
# load lexicon
rd_lexicon <- scan(file = "dict/rd_lexicon.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
rd_lexicon
##  [1] "太魯閣號"         "台鐵\t\t\t\t"     "蔡英文\t"         "太魯閣\t\t\t\t"  
##  [5] "工地"             "主任\t\t"         "清水隧道"         "隧道\t\t"        
##  [9] "撞上\t\t\t"       "義祥\t\t\t\t\t"   "工業\t\t\t"       "事故\t\t\t\t"    
## [13] "採購法\t\t\t"     "工程車"           "滑落\t\t"         "出軌\t\t\t\t\t\t"
## [17] "改革\t\t\t\t"     "行車安全"         "安全\t\t\t"       "報告\t\t\t"      
## [21] "死亡\t\t\t\t\t\t" "負責人 "          "李義祥"           "政府"            
## [25] "事故"             "交通部長"         "林佳龍"           "台鐵事故"        
## [29] "台鐵民營化"       "民營化"           "花蓮縣黨代表"     "台鐵員工"        
## [33] "帶風向"           "過失"             "致死"             "罹難者"          
## [37] "家屬"             "情節重大"         "普悠瑪事故"       "公共工程"        
## [41] "滑落"             "邊坡"             "受傷"             "輕傷"            
## [45] "重傷"             "人輕重傷"         "司機"             "事故原因"        
## [49] "東新營造"         "發生事故"         "監造單位"         "隧道內"          
## [53] "負責人"           "義程營造"         "萬元交保"         "不良廠商"        
## [57] "安全防護"         "地檢署"           "改善"             "鐵道"            
## [61] "李義祥"           "蔡英文"           "台鐵員工"         "台鐵局"          
## [65] "重大"             "民進黨"           "外役監"           "調查"            
## [69] "紀錄"             "釐清事故原因\t"   "釐清"             "委員會"          
## [73] "管理局"           "故障"             "家屬"             "罹難者"          
## [77] "工會"             "工地"             "營造"             "工程車"
jieba_tokenizer = worker()

# re-segment using the incident-specific lexicon:
# register the custom words with jieba
new_user_word(jieba_tokenizer, c(rd_lexicon))
## [1] TRUE
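# Hedged check that the lexicon took effect: 「太魯閣號」 should now come out
# as a single token (the exact segmentation may vary with jieba's model):
# segment("台鐵太魯閣號出軌", jieba_tokenizer)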
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop stop words
      tokens <- tokens[!tokens %in% stop_words]
      # single-character tokens are kept this time
      #tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
# count the number of articles in which each pair of words co-occurs
word_pairs <- rd_words %>%
  pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
#word_pairs
# compute the correlation (phi coefficient) between word pairs
word_cors <- rd_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

#word_cors
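# pairwise_cor computes the phi coefficient of word presence/absence across
# articles. A hedged cross-check for one pair (close to, but not necessarily
# equal to, the values below, since the n() >= 20 filter changes the document set):
# presence <- rd_words %>%
#   distinct(artUrl, word) %>%
#   mutate(present = 1L) %>%
#   pivot_wider(names_from = word, values_from = present, values_fill = 0L)
# cor(presence[["太魯閣"]], presence[["出軌"]])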
# words most correlated with 「太魯閣」
word_cors %>%
  filter(item1 == "太魯閣") %>% 
  head(5)
## # A tibble: 5 x 3
##   item1  item2    correlation
##   <chr>  <chr>          <dbl>
## 1 太魯閣 記者           0.626
## 2 太魯閣 出軌           0.611
## 3 太魯閣 備註           0.539
## 4 太魯閣 新聞標題       0.536
## 5 太魯閣 媒體           0.532
# words most correlated with 「義祥」
word_cors %>%
  filter(item1 == "義祥") %>% 
  head(5)
## # A tibble: 5 x 3
##   item1 item2  correlation
##   <chr> <chr>        <dbl>
## 1 義祥  工業         0.603
## 2 義祥  負責人       0.504
## 3 義祥  李義祥       0.291
## 4 義祥  營造         0.283
## 5 義祥  地院         0.251
# words most correlated with 「台鐵」
word_cors %>%
  filter(item1 == "台鐵") %>% 
  head(5)
## # A tibble: 5 x 3
##   item1 item2 correlation
##   <chr> <chr>       <dbl>
## 1 台鐵  記者        0.179
## 2 台鐵  報導        0.171
## 3 台鐵  表示        0.170
## 4 台鐵  署名        0.161
## 5 台鐵  改革        0.160
# find the 10 words most highly correlated with each of "台鐵", "太魯閣", "政府", "義祥"
word_cors %>%
  filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
  group_by(item1) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text())
## Selecting by correlation

# show pairs with correlation above 0.4
set.seed(2020)

word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + # set a Chinese-capable font here so CJK labels render correctly
  theme_void()
## Warning: ggrepel: 36 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# show pairs with correlation above 0.5
set.seed(2020)

word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 24 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# pick a few news-template boilerplate words as seed words
seed_words <- c("完整", "作者", "記者", "來源", "媒體", "短", "這件事")
# set the threshold to 0.6
threshold <- 0.6
# words whose correlation with any seed word exceeds the threshold join the removal list
remove_words <- word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
#remove_words
# drop pairs that contain any of the removed words
word_cors_new <- word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))

word_cors_new %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps