Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# CRAN packages used in this analysis (ropencc is not on CRAN; see below)
packages = c("readr", "dplyr", "wordcloud2", "stringr", "jiebaR", "tidytext",
             "NLP", "tidyr", "ggplot2", "ggraph", "igraph", "reshape2",
             "scales", "widyr", "data.table", "knitr", "lubridate",
             "htmlwidgets", "webshot", "plotly", "RColorBrewer", "servr",
             "tm", "topicmodels", "LDAvis")
existing = as.character(installed.packages()[, 1])
for (pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(readr)
## Loading required package: readr
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(NLP)
## Loading required package: NLP
require(tidyr)
## Loading required package: tidyr
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(ggraph)
## Loading required package: ggraph
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(scales)
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
require(widyr)
## Loading required package: widyr
require(knitr)
## Loading required package: knitr
require(ropencc)
## Loading required package: ropencc
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'ropencc'
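# ropencc (simplified/traditional Chinese conversion) is not on CRAN, which is
# why require() fails above. Assuming the GitHub source qinwf/ropencc is still
# available, it could be installed with:
# if (!requireNamespace("ropencc", quietly = TRUE)) {
#   remotes::install_github("qinwf/ropencc")
# }
# It is not used in the analysis below, so the load failure is harmless here.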
require(wordcloud2)
## Loading required package: wordcloud2
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:igraph':
##
## %--%, union
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
require(htmlwidgets)
## Loading required package: htmlwidgets
require(webshot)
## Loading required package: webshot
require(plotly)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
require( RColorBrewer)
## Loading required package: RColorBrewer
require(servr)
## Loading required package: servr
require(tm)
## Loading required package: tm
library(topicmodels)
require(LDAvis)
## Loading required package: LDAvis
rd <- read_csv("太魯閣事件_artWordFreq.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## word = col_character(),
## count = col_double()
## )
rd2 <- read_csv("太魯閣事件_articleMetaData.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
rd$artDate <- as.Date(rd$artDate) # read_csv already parsed artDate as a Date; this is a no-op safeguard
rd
## # A tibble: 100,230 x 6
## artTitle artDate artTime artUrl word count
## <chr> <date> <time> <chr> <chr> <dbl>
## 1 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 沒開 3
## 2 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 今年 1
## 3 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 首波 1
## 4 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 玩笑 1
## 5 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 竟然 1
## 6 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 台鐵整~ 1
## 7 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 設好 1
## 8 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 鬧鐘 1
## 9 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 車票 1
## 10 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 日期 1
## # ... with 100,220 more rows
jieba_tokenizer = worker()
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
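# Quick check of the tokenizer. The same sample sentence is segmented in the
# n-gram section below, so the expected tokens are known:
chi_tokenizer("明天記得吃飯")
## [[1]]
## [1] "明天" "記得" "吃飯"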
data <- rd %>%
  dplyr::select(artDate, artUrl) %>%
  distinct()
article_count_by_date <- data %>%
  group_by(artDate) %>%
  summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 15 x 2
## artDate count
## <date> <int>
## 1 2021-03-31 1
## 2 2021-04-01 1
## 3 2021-04-02 354
## 4 2021-04-03 205
## 5 2021-04-04 215
## 6 2021-04-05 121
## 7 2021-04-06 134
## 8 2021-04-07 200
## 9 2021-04-08 79
## 10 2021-04-09 80
## 11 2021-04-10 35
## 12 2021-04-11 17
## 13 2021-04-12 18
## 14 2021-04-13 20
## 15 2021-04-14 5
plot_date <-
  # data
  article_count_by_date %>%
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) +
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("PTT 八卦版 討論文章數") +
  xlab("日期") +
  ylab("數量") +
  # theme (set a Chinese-capable font here to avoid garbled labels)
  theme()
plot_date

data <- rd %>%
  group_by(word) %>%
  summarise(sum = sum(count), .groups = 'drop') %>%
  arrange(desc(sum))
head(data)
## # A tibble: 6 x 2
## word sum
## <chr> <dbl>
## 1 台鐵 2600
## 2 事故 792
## 3 問題 759
## 4 工程 715
## 5 工程車 715
## 6 太魯閣 713
data %>% filter(sum > 50) %>% wordcloud2()
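# wordcloud2() returns an HTML widget. To embed a static image in a report,
# one common route (assuming PhantomJS is installed, e.g. via
# webshot::install_phantomjs()) is:
wc <- data %>% filter(sum > 50) %>% wordcloud2()
saveWidget(wc, "wordcloud.html", selfcontained = FALSE)
webshot("wordcloud.html", "wordcloud.png", delay = 5)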
rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
## # A tibble: 6 x 4
## artTitle artDate word count
## <chr> <date> <chr> <dbl>
## 1 [問卦]台鐵是在愚人節? 2021-03-31 沒開 3
## 2 [問卦]台鐵是在愚人節? 2021-03-31 今年 1
## 3 [問卦]台鐵是在愚人節? 2021-03-31 首波 1
## 4 [問卦]台鐵是在愚人節? 2021-03-31 玩笑 1
## 5 [問卦]台鐵是在愚人節? 2021-03-31 竟然 1
## 6 [問卦]台鐵是在愚人節? 2021-03-31 台鐵整 1
rd_tokens_by_date <- rd_tokens %>%
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)
plot_merge <- rd_tokens_by_date %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-10"), by = "day")) %>%
  group_by(artDate) %>%
  top_n(7, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales = "free", ncol = 5) +
  coord_flip() +
  theme(text = element_text())
plot_merge

# positive-word dictionary (txt file, comma-separated)
P <- read_file("dict/positive.txt")
# negative-word dictionary (txt file, comma-separated)
N <- read_file("dict/negative.txt")
# split each string on ","
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment,
# where word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
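# Sanity check: dictionary size per polarity after the merge
table(LIWC$sentiment)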
rd_tokens_by_date %>%
  inner_join(LIWC) %>%
  select(word, sentiment)
## Joining, by = "word"
## # A tibble: 188 x 2
## word sentiment
## <chr> <chr>
## 1 事故 negative
## 2 問題 negative
## 3 問題 negative
## 4 事故 negative
## 5 事故 negative
## 6 問題 negative
## 7 安全 positive
## 8 八卦 negative
## 9 事故 negative
## 10 嚴重 negative
## # ... with 178 more rows
sentiment_count = rd_tokens_by_date %>%
  select(artDate, word, n) %>%
  inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = sum(n))
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"))

rd_tokens_all <- rd2 %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  select(-artTime)
rd_tokens_all %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-08"), by = "day")) %>%
  inner_join(LIWC) %>%
  filter(sentiment == "positive") %>%
  group_by(artUrl, sentiment) %>%
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>%
  arrange(desc(count))
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
## # A tibble: 897 x 4
## # Groups: artUrl [897]
## artUrl sentiment artTitle count
## <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping~ positive [新聞]台鐵體檢報告曝光,近7年出軌事件「零改~ 86
## 2 https://www.ptt.cc/bbs/Gossiping~ positive Re:[問卦]為什麼台鐵要民營化啊?~ 85
## 3 https://www.ptt.cc/bbs/Gossiping~ positive Re:[新聞]抓到了!李義祥今年2月早被判關半年 高~ 75
## 4 https://www.ptt.cc/bbs/Gossiping~ positive [新聞]【後續調查】台鐵工地與安全管理長期鬆~ 64
## 5 https://www.ptt.cc/bbs/Gossiping~ positive [新聞]防災專家:台鐵要轉骨須府院出手~ 31
## 6 https://www.ptt.cc/bbs/Gossiping~ positive [新聞]台鐵出軌》蔡英文:改革台鐵勢在必行,~ 30
## 7 https://www.ptt.cc/bbs/Gossiping~ positive Re:[問卦]台鐵不趕快民營化是在幹嘛???????~ 26
## 8 https://www.ptt.cc/bbs/Gossiping~ positive [問卦]為什麼台鐵要民營化啊?~ 25
## 9 https://www.ptt.cc/bbs/Gossiping~ positive Re:[問卦]明明就工程車駕駛的鍋在無限上綱什麼?~ 24
## 10 https://www.ptt.cc/bbs/Gossiping~ positive [新聞]台鐵總體檢報告卡「賴下蘇上」遲未核定 ~ 23
## # ... with 887 more rows
rd_tokens_all %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-08"), by = "day")) %>%
  inner_join(LIWC) %>%
  filter(sentiment == "negative") %>%
  group_by(artUrl, sentiment) %>%
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>%
  arrange(desc(count))
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
## # A tibble: 998 x 4
## # Groups: artUrl [998]
## artUrl sentiment artTitle count
## <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]台鐵體檢報告曝光,近7年出軌事件「零改~ 99
## 2 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]抓到了!李義祥今年2月早被判關半年 高~ 92
## 3 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]【後續調查】台鐵工地與安全管理長期鬆~ 70
## 4 https://www.ptt.cc/bbs/Gossiping~ negative Re:[問卦]為什麼台鐵要民營化啊?~ 63
## 5 https://www.ptt.cc/bbs/Gossiping~ negative [問卦]為什麼台鐵要民營化啊?~ 53
## 6 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]防災專家:台鐵要轉骨須府院出手~ 37
## 7 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]【台鐵出軌】藍營要求成立事故調閱委員會~ 31
## 8 https://www.ptt.cc/bbs/Gossiping~ negative Re:[新聞]台鐵出軌日媒:暴露台灣基礎設施缺陷~ 29
## 9 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]台鐵出軌》蔡英文:改革台鐵勢在必行,~ 29
## 10 https://www.ptt.cc/bbs/Gossiping~ negative [新聞]台鐵出軌》綠委喊話:林佳龍有種留下把~ 26
## # ... with 988 more rows
rd_tokens_all %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-08"), by = "day")) %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    count = n()
  ) %>%
  data.frame() %>%
  top_n(30, wt = count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text = element_text(size = 14)) +
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

# 11-grams put word6 at the centre, giving five words of context on each side.
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", 1:11), sep = " ")
head(g_ngrams_11_separated)
## # A tibble: 6 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://w~ 今年 首波 玩笑 竟然 是 台鐵整~ 設好 鬧鐘 等 著 搶
## 2 https://w~ 獨家 最美 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌
## 3 https://w~ 最美 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉
## 4 https://w~ 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉 手
## 5 https://w~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉 手 護送
## 6 https://w~ 小 英 來 開箱 台 鐵 金牌 運轉 手 護送
g_check_words <- g_ngrams_11_separated %>%
  filter(word6 == "太魯閣")
g_check_words
## # A tibble: 305 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 春節 疏運 他 駕駛 的 太魯閣~ 號 遇上 電車 線 故障
## 2 https://~ 連假 第一天~ 台鐵今~ 上午 驚傳 太魯閣~ 號在 花蓮 大 清水 隧道
## 3 https://~ 連假台~ 鐵 工程 全 暫停 太魯閣~ 號卻 撞 上 工程車 出軌
## 4 https://~ 出動 調 查 台鐵局~ 表示 太魯閣~ 號是 撞 上台 鐵局 的
## 5 https://~ 這麼 急好 ㄇ 台鐵局~ 表示 太魯閣~ 號是 撞 上台 鐵局 的
## 6 https://~ 鐵 局局長~ 請辭 這次 花蓮 太魯閣~ 出軌 會 由誰來~ 負責 司機
## 7 https://~ 火車撞~ 上 而且 那 班次 太魯閣~ 應該 不是 第一輛~ 通過 該處
## 8 https://~ 是從 樹林 前往 台東 的 太魯閣~ 號 全車 搭車 旅客 約
## 9 https://~ 緊急 成立 一級 應變 小組 太魯閣~ 撞 工程車~ 出軌 圖 記者
## 10 https://~ 彥傑 翻攝 原文 網址 快訊 太魯閣~ 出軌 車頭 照片 曝光
## # ... with 295 more rows
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word = value) %>%
  filter(variable != "word6") %>%
  # NB: the custom stop-word list is only loaded further below; at this point
  # `stop_words` resolves to tidytext's built-in (English) dataset, so this
  # filter is effectively a no-op for Chinese tokens
  filter(!(word %in% stop_words), nchar(word) > 1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「太魯閣」附近的字") +
  ylab("出現次數") +
  coord_flip() +
  theme(text = element_text())

# Reuse the 11-gram objects built above; this time inspect the context
# around 「台鐵」.
g_ngrams_11_separated
## # A tibble: 170,303 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 今年 首波 玩笑 竟然 是 台鐵整~ 設好 鬧鐘 等 著 搶
## 2 https://~ 獨家 最美 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌
## 3 https://~ 最美 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉
## 4 https://~ 區間車~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉 手
## 5 https://~ 啟航 小 英 來 開箱 台 鐵 金牌 運轉 手 護送
## 6 https://~ 小 英 來 開箱 台 鐵 金牌 運轉 手 護送
## 7 https://~ 英 來 開箱 台 鐵 金牌 運轉 手 護送 親
## 8 https://~ 來 開箱 台 鐵 金牌 運轉 手 護送 親 曝
## 9 https://~ 開箱 台 鐵 金牌 運轉 手 護送 親 曝 新車
## 10 https://~ 台 鐵 金牌 運轉 手 護送 親 曝 新車 這裡
## # ... with 170,293 more rows
g_check_words <- g_ngrams_11_separated %>%
  filter(word6 == "台鐵")
g_check_words
## # A tibble: 1,661 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 到 讓 他 能 脫穎而出~ 台鐵 內部 對 他 評價 則是
## 2 https://~ 大學 畢業 後 就 鎖定 台鐵 報考 特考 期間 先到 日商
## 3 https://~ 分鐘 後 再 啟程 而 台鐵 規定 對號 列車 延誤 逾
## 4 https://~ 全額 退 費 為 讓 台鐵 不 虧 錢 他 在
## 5 https://~ 了 那麼 多 努力 改革 台鐵 讓 台 鐵 煥然一新~ 你
## 6 https://~ 如果 今天 部長 不是 林佳龍~ 台鐵 恐怕 無法 避免 發生 更
## 7 https://~ 工程車~ 翻覆 在 邊坡 旁 台鐵 表示 目前 路線 暫以 單線
## 8 https://~ 就 發生 這種 事故 到底 台鐵 怎麼 了 以後 誰 敢
## 9 https://~ 跡象 記者 李宜秦~ 台北 報導 台鐵 驚傳 出軌 事故 交通部 台鐵局
## 10 https://~ 台 鐵 事故 不算 少 台鐵 工會 也 常 抱怨 既然
## # ... with 1,651 more rows
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word = value) %>%
  filter(variable != "word6") %>%
  # (see the stop-word note above)
  filter(!(word %in% stop_words), nchar(word) > 1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「台鐵」附近的字") +
  ylab("出現次數") +
  coord_flip() +
  theme(text = element_text())

# Tokenize and count each word's occurrences per article
rd_words <- rd2 %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
head(rd_words)
## # A tibble: 6 x 3
## artUrl word n
## <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商 95
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購 55
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵 44
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢 39
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車 38
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道 36
# Count the total number of tokens in each article
total_words <- rd_words %>%
  group_by(artUrl) %>%
  summarize(total = sum(n))
head(total_words)
## # A tibble: 6 x 2
## artUrl total
## <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617207223.A.020.html 16
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617249914.A.8DA.html 465
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617329020.A.9C1.html 18
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617329416.A.0EA.html 29
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617330379.A.DDD.html 22
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617331302.A.E6D.html 189
# Join rd_words (per-article word counts)
# with total_words (per-article token totals)
# to add each article's total as a column
rd_words <- left_join(rd_words, total_words)
## Joining, by = "artUrl"
head(rd_words)
## # A tibble: 6 x 4
## artUrl word n total
## <chr> <chr> <int> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商 95 2912
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購 55 2912
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵 44 2773
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢 39 2773
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車 38 2773
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道 36 2773
# Compute each word's tf-idf, treating every article as one document
rd_words_tf_idf <- rd_words %>%
  bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf
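# bind_tf_idf() computes tf = n / total and idf = ln(N / df), where N is the
# number of articles and df is the number of articles containing the word.
# A minimal sketch reproducing it by hand, for comparison:
rd_words %>%
  group_by(word) %>%
  mutate(df = n_distinct(artUrl)) %>%   # articles containing this word
  ungroup() %>%
  mutate(tf     = n / total,
         idf    = log(n_distinct(artUrl) / df),
         tf_idf = tf * idf)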
# For each article, pick the ten words with the highest tf-idf
rd_words_tf_idf %>%
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl))
## Selecting by tf_idf
## # A tibble: 16,458 x 7
## # Groups: artUrl [1,485]
## artUrl word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.16~ 李義祥 3 35 0.0857 1.84 0.158
## 2 https://www.ptt.cc/bbs/Gossiping/M.16~ 醜化 2 35 0.0571 7.30 0.417
## 3 https://www.ptt.cc/bbs/Gossiping/M.16~ 人格 1 35 0.0286 6.20 0.177
## 4 https://www.ptt.cc/bbs/Gossiping/M.16~ 利害關係~ 1 35 0.0286 6.61 0.189
## 5 https://www.ptt.cc/bbs/Gossiping/M.16~ 害人 1 35 0.0286 6.61 0.189
## 6 https://www.ptt.cc/bbs/Gossiping/M.16~ 追殺 1 35 0.0286 6.61 0.189
## 7 https://www.ptt.cc/bbs/Gossiping/M.16~ 揣測 1 35 0.0286 6.20 0.177
## 8 https://www.ptt.cc/bbs/Gossiping/M.16~ 無數 1 35 0.0286 5.69 0.163
## 9 https://www.ptt.cc/bbs/Gossiping/M.16~ 毀滅 1 35 0.0286 6.61 0.189
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 網軍的 1 35 0.0286 6.20 0.177
## # ... with 16,448 more rows
# Pick each article's ten highest-tf-idf words,
# then count how often each word is selected
rd_words_tf_idf %>%
  group_by(artUrl) %>%
  top_n(10) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort = TRUE)
## Selecting by tf_idf
## # A tibble: 10,782 x 2
## word n
## <chr> <int>
## 1 李義祥 46
## 2 民營化 43
## 3 改革 40
## 4 公司化 33
## 5 民進黨 31
## 6 高鐵 27
## 7 工程 26
## 8 工地 23
## 9 工程車 23
## 10 捐款 22
## # ... with 10,772 more rows
# Tokenize with jieba, then build n-grams with the ngrams() function from the NLP package
# e.g.
tokens <- segment("明天記得吃飯", jieba_tokenizer)
tokens
## [1] "明天" "記得" "吃飯"
bigram <- ngrams(tokens, 2)
bigram
## [[1]]
## [1] "明天" "記得"
##
## [[2]]
## [1] "記得" "吃飯"
# Combine each bigram into a single string, with " " as the separator.
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
## [1] "明天 記得" "記得 吃飯"
jieba_tokenizer = worker()
# bigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      bigram <- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
jieba_bigram(c("明天記得吃飯", "早上準時起來"))
## [[1]]
## [1] "明天 記得" "記得 吃飯"
##
## [[2]]
## [1] "早上 準時" "準時 起來"
# run bigram tokenization
rd_bigram <- rd2 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
#rd_bigram
# drop bigrams containing digits or Latin letters,
# then count how often each combination appears
rd_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE) %>% head
## # A tibble: 6 x 2
## bigram n
## <chr> <int>
## 1 台 鐵 784
## 2 太魯閣 號 425
## 3 完整 新聞 304
## 4 都 是 281
## 5 也 是 235
## 6 的 人 221
# trigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      ngram <- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
jieba_trigram(c("明天記得吃飯", "早上準時起來"))
## [[1]]
## [1] "明天 記得 吃飯"
##
## [[2]]
## [1] "早上 準時 起來"
# run trigram tokenization
rd_trigram <- rd2 %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
rd_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE) %>% head
## # A tibble: 6 x 2
## ngrams n
## <chr> <int>
## 1 完整 新聞 內文 152
## 2 完整 新聞 連結 152
## 3 或 短 網址 150
## 4 連結 或 短 150
## 5 新聞 連結 或 150
## 6 台鐵 太魯閣 號 115
# load stop words
stop_words <- scan(file = "dict/stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
## Warning in scan(file = "dict/stop_words.txt", what = character(), sep = "\n", :
## 輸入連結 'dict/stop_words.txt' 中的輸入不正確
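# The scan() warning usually points at characters the declared encoding cannot
# read, or a missing final newline. Assuming the file is plain UTF-8 with one
# word per line, readLines() is a simpler alternative:
# stop_words <- readLines("dict/stop_words.txt", encoding = "UTF-8")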
# remove the stop words in bigram
rd_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite(bigram, word1, word2, sep = " ")
## # A tibble: 87,018 x 2
## bigram n
## <chr> <int>
## 1 台 鐵 784
## 2 太魯閣 號 425
## 3 完整 新聞 304
## 4 蔡 英文 170
## 5 記者 署名 158
## 6 媒體 來源 156
## 7 完整 新聞標題 154
## 8 新聞 連結 154
## 9 新聞 內文 152
## 10 短 網址 151
## # ... with 87,008 more rows
# remove the stop words in trigram
rd_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite(ngrams, word1, word2, word3, sep = " ")
## # A tibble: 78,185 x 2
## ngrams n
## <chr> <int>
## 1 完整 新聞 內文 152
## 2 完整 新聞 連結 152
## 3 台鐵 太魯閣 號 115
## 4 太魯閣 號 事故 75
## 5 太魯閣 號 出軌 66
## 6 總體 檢 報告 59
## 7 新聞 內文 台鐵 56
## 8 李 義 祥 53
## 9 完整 新聞標題 台鐵 44
## 10 次 太魯閣 號 43
## # ... with 78,175 more rows
# load the custom lexicon for this topic
rd_lexicon <- scan(file = "dict/rd_lexicon.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
rd_lexicon
## [1] "太魯閣號" "台鐵\t\t\t\t" "蔡英文\t" "太魯閣\t\t\t\t"
## [5] "工地" "主任\t\t" "清水隧道" "隧道\t\t"
## [9] "撞上\t\t\t" "義祥\t\t\t\t\t" "工業\t\t\t" "事故\t\t\t\t"
## [13] "採購法\t\t\t" "工程車" "滑落\t\t" "出軌\t\t\t\t\t\t"
## [17] "改革\t\t\t\t" "行車安全" "安全\t\t\t" "報告\t\t\t"
## [21] "死亡\t\t\t\t\t\t" "負責人 " "李義祥" "政府"
## [25] "事故" "交通部長" "林佳龍" "台鐵事故"
## [29] "台鐵民營化" "民營化" "花蓮縣黨代表" "台鐵員工"
## [33] "帶風向" "過失" "致死" "罹難者"
## [37] "家屬" "情節重大" "普悠瑪事故" "公共工程"
## [41] "滑落" "邊坡" "受傷" "輕傷"
## [45] "重傷" "人輕重傷" "司機" "事故原因"
## [49] "東新營造" "發生事故" "監造單位" "隧道內"
## [53] "負責人" "義程營造" "萬元交保" "不良廠商"
## [57] "安全防護" "地檢署" "改善" "鐵道"
## [61] "李義祥" "蔡英文" "台鐵員工" "台鐵局"
## [65] "重大" "民進黨" "外役監" "調查"
## [69] "紀錄" "釐清事故原因\t" "釐清" "委員會"
## [73] "管理局" "故障" "家屬" "罹難者"
## [77] "工會" "工地" "營造" "工程車"
jieba_tokenizer = worker()
# the lexicon entries above carry stray tabs from the source file; strip them,
# then add the incident-specific terms to the tokenizer as user words
rd_lexicon <- trimws(rd_lexicon)
new_user_word(jieba_tokenizer, c(rd_lexicon))
## [1] TRUE
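# Quick check that lexicon terms now come out as single tokens
# (the exact segmentation still depends on jieba's base dictionary):
segment("台鐵太魯閣號出軌事故", jieba_tokenizer)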
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # optionally drop single-character tokens
      #tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
# count how often each pair of words co-occurs in the same article
word_pairs <- rd_words %>%
  pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
#word_pairs
# compute the correlation between pairs of words
word_cors <- rd_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)
#word_cors
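# pairwise_cor() returns the phi coefficient: the Pearson correlation between
# two words' presence/absence indicators across articles. A rough spot check
# for one pair (articles containing neither word are dropped here, so the
# value can differ slightly from pairwise_cor(), which keeps the full set):
spot <- rd_words %>%
  filter(word %in% c("太魯閣", "出軌")) %>%
  transmute(artUrl, word, present = 1L) %>%
  pivot_wider(names_from = word, values_from = present, values_fill = 0)
cor(spot$太魯閣, spot$出軌)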
# words most correlated with 太魯閣
word_cors %>%
  filter(item1 == "太魯閣") %>%
  head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 太魯閣 記者 0.626
## 2 太魯閣 出軌 0.611
## 3 太魯閣 備註 0.539
## 4 太魯閣 新聞標題 0.536
## 5 太魯閣 媒體 0.532
# words most correlated with 義祥
word_cors %>%
  filter(item1 == "義祥") %>%
  head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 義祥 工業 0.603
## 2 義祥 負責人 0.504
## 3 義祥 李義祥 0.291
## 4 義祥 營造 0.283
## 5 義祥 地院 0.251
# words most correlated with 台鐵
word_cors %>%
  filter(item1 == "台鐵") %>%
  head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 台鐵 記者 0.179
## 2 台鐵 報導 0.171
## 3 台鐵 表示 0.170
## 4 台鐵 署名 0.161
## 5 台鐵 改革 0.160
# for each of 台鐵, 太魯閣, 政府, 義祥, find the ten most correlated words
word_cors %>%
  filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
  group_by(item1) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip() +
  theme(text = element_text())
## Selecting by correlation

# show pairs with correlation above 0.4
set.seed(2020)
word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + # set a Chinese-capable font here to avoid garbled labels
  theme_void()
## Warning: ggrepel: 36 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# show pairs with correlation above 0.5
set.seed(2020)
word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 24 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# pick a few seed words (news-template boilerplate)
seed_words <- c("完整", "作者", "記者", "來源", "媒體", "短", "這件事")
# set the threshold to 0.6
threshold <- 0.6
# words correlated with a seed word above the threshold go on the removal list
remove_words <- word_cors %>%
  filter((item1 %in% seed_words | item2 %in% seed_words), correlation > threshold) %>%
  .$item1 %>%
  unique()
#remove_words
# drop every pair involving one of those words
word_cors_new <- word_cors %>%
  filter(!(item1 %in% remove_words | item2 %in% remove_words))
word_cors_new %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
