分析PTT八卦版對太魯閣號出軌事件的文字資料
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
= c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr")
packages = as.character(installed.packages()[,1])
existing for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(readr)
## Loading required package: readr
require(dplyr)
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(NLP)
## Loading required package: NLP
require(tidyr)
## Loading required package: tidyr
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(ggraph)
## Loading required package: ggraph
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(scales)
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
require(widyr)
## Loading required package: widyr
library(data.table)
require(wordcloud2)
## Loading required package: wordcloud2
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:igraph':
##
## %--%, union
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
require(htmlwidgets)
## Loading required package: htmlwidgets
require(webshot)
## Loading required package: webshot
require(plotly)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
require( RColorBrewer)
## Loading required package: RColorBrewer
require(servr)
## Loading required package: servr
require(tm)
## Loading required package: tm
require(data.table)
require(stringr)
library(topicmodels)
require(LDAvis)
## Loading required package: LDAvis
require(webshot)
<- read_csv("太魯閣事件_artWordFreq.csv") rd
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## word = col_character(),
## count = col_double()
## )
<- read_csv("太魯閣事件_articleMetaData.csv") rd2
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
$artDate <- rd$artDate %>% as.Date("%Y/%m/%d")
rd rd
## # A tibble: 100,230 x 6
## artTitle artDate artTime artUrl word count
## <chr> <date> <time> <chr> <chr> <dbl>
## 1 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 沒開 3
## 2 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 今年 1
## 3 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 首波 1
## 4 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 玩笑 1
## 5 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 竟然 1
## 6 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 台鐵整~ 1
## 7 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 設好 1
## 8 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 鬧鐘 1
## 9 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 車票 1
## 10 [問卦]台鐵是在愚人節?~ 2021-03-31 16:13:41 https://www.ptt.cc/bbs/Gossi~ 日期 1
## # ... with 100,220 more rows
<- rd %>%
data ::select(artDate, artUrl) %>%
dplyrdistinct()
<- data %>%
article_count_by_date group_by(artDate) %>%
summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 15 x 2
## artDate count
## <date> <int>
## 1 2021-03-31 1
## 2 2021-04-01 1
## 3 2021-04-02 354
## 4 2021-04-03 205
## 5 2021-04-04 215
## 6 2021-04-05 121
## 7 2021-04-06 134
## 8 2021-04-07 200
## 9 2021-04-08 79
## 10 2021-04-09 80
## 11 2021-04-10 35
## 12 2021-04-11 17
## 13 2021-04-12 18
## 14 2021-04-13 20
## 15 2021-04-14 5
<-
plot_date # data
%>%
article_count_by_date # aesthetics
ggplot(aes(x = artDate, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 1) +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("PTT 八卦版 討論文章數") +
xlab("日期") +
ylab("數量") +
# theme
theme() #加入中文字型設定,避免中文字顯示錯誤。
plot_date
<- rd %>%
data group_by(word) %>%
summarise(sum = sum(count), .groups = 'drop') %>%
arrange(desc(sum))
head(data)
## # A tibble: 6 x 2
## word sum
## <chr> <dbl>
## 1 台鐵 2600
## 2 事故 792
## 3 問題 759
## 4 工程 715
## 5 工程車 715
## 6 太魯閣 713
%>% filter(sum > 50) %>% wordcloud2() data
<- rd %>%
rd_tokens select(-artTime, -artUrl)
head(rd_tokens)
## # A tibble: 6 x 4
## artTitle artDate word count
## <chr> <date> <chr> <dbl>
## 1 [問卦]台鐵是在愚人節? 2021-03-31 沒開 3
## 2 [問卦]台鐵是在愚人節? 2021-03-31 今年 1
## 3 [問卦]台鐵是在愚人節? 2021-03-31 首波 1
## 4 [問卦]台鐵是在愚人節? 2021-03-31 玩笑 1
## 5 [問卦]台鐵是在愚人節? 2021-03-31 竟然 1
## 6 [問卦]台鐵是在愚人節? 2021-03-31 台鐵整 1
<- rd_tokens %>%
rd_tokens_by_date count(artDate, word, sort = TRUE) %>%
filter(n > 5)
<- scan(file = "dict/stop_words.txt", what=character(),sep='\n',
stop_words encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "dict/stop_words.txt", what = character(), sep = "\n", :
## 輸入連結 'dict/stop_words.txt' 中的輸入不正確
<- rd_tokens_by_date %>%
plot_merge filter(!(word %in% stop_words) & !(word %in% "台鐵")) %>%
filter(artDate == as.Date("2021-04-02") |
== as.Date("2021-04-03") |
artDate == as.Date("2021-04-04") |
artDate == as.Date("2021-04-05") |
artDate == as.Date("2021-04-06") |
artDate == as.Date("2021-04-07") |
artDate == as.Date("2021-04-08") |
artDate == as.Date("2021-04-09") |
artDate == as.Date("2021-04-10")) %>%
artDate group_by(artDate) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n, fill = artDate)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL) +
facet_wrap(~artDate, scales="free", ncol = 5) +
coord_flip()+
theme(text = element_text())
plot_merge
# 正向字典txt檔
# 以,將字分隔
<- read_file("dict/positive.txt")
P
# 負向字典txt檔
<- read_file("dict/negative.txt") N
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
= strsplit(P, ",")[[1]]
P = strsplit(N, ",")[[1]]
N
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
= data.frame(word = P, sentiment = "positive")
P = data.frame(word = N, sentiment = "negative") N
= rbind(P, N) LIWC
%>%
rd_tokens_by_date inner_join(LIWC) %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## Joining, by = "word"
## # A tibble: 188 x 2
## word sentiment
## <chr> <chr>
## 1 事故 negative
## 2 問題 negative
## 3 問題 negative
## 4 事故 negative
## 5 事故 negative
## 6 問題 negative
## 7 安全 positive
## 8 八卦 negative
## 9 事故 negative
## 10 嚴重 negative
## # ... with 178 more rows
= rd_tokens_by_date %>%
sentiment_count select(artDate,word,n) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(n))
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
# 加入自定義的字典
<- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
jieba_tokenizer
# 設定斷詞function
<- function(t) {
chi_tokenizer lapply(t, function(x) {
if(nchar(x)>1){
<- segment(x, jieba_tokenizer)
tokens # 去掉字串長度爲1的詞彙
<- tokens[nchar(tokens)>1]
tokens return(tokens)
}
}) }
%>%
sentiment_count ggplot() +
geom_line(aes(x=artDate,y=count,colour=sentiment)) +
labs(x=NULL,y="數量")
scale_x_date(labels = date_format("%m/%d"))
## <ScaleContinuousDate>
## Range:
## Limits: 0 -- 1
<- rd2 %>%
rd_tokens_all unnest_tokens(word, sentence, token=chi_tokenizer) %>%
select(-artTime)
%>%
rd_tokens_all filter(artDate == as.Date("2021-04-02") |
== as.Date("2021-04-03") |
artDate == as.Date("2021-04-04") |
artDate == as.Date("2021-04-05") |
artDate == as.Date("2021-04-06") |
artDate == as.Date("2021-04-07") |
artDate == as.Date("2021-04-08") ) %>%
artDate inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
%>% data.frame() %>%
) top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
labs(x= "文字", y="數量") +
facet_wrap(~sentiment, scales = "free_y") +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
<- function(t) {
ngram_11 lapply(t, function(x) {
<- segment(x, jieba_tokenizer)
tokens <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
ngram unlist(ngram)
})
}<- rd2 %>%
g_ngram_11 select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
<- g_ngram_11 %>%
g_ngrams_11_separated separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
head(g_ngrams_11_separated)
## # A tibble: 6 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://w~ 獨家 最美 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉
## 2 https://w~ 最美 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手
## 3 https://w~ 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手 護送
## 4 https://w~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手 護送 親
## 5 https://w~ 小 英 開箱 台 鐵 金牌 運轉 手 護送 親 曝
## 6 https://w~ 英 開箱 台 鐵 金牌 運轉 手 護送 親 曝 新車
<- g_ngrams_11_separated %>%
g_check_words filter((word6 == "太魯閣"))
g_check_words
## # A tibble: 295 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 說 年 春節 疏運 駕駛 太魯閣~ 號 遇上 電車 線 故障
## 2 https://~ 連假 第一天~ 台鐵今~ 上午 驚傳 太魯閣~ 號在 花蓮 清水 隧道 出軌
## 3 https://~ 連假台~ 鐵 工程 全 暫停 太魯閣~ 號卻 撞 工程車~ 出軌 今日
## 4 https://~ 運安處~ 出動 調 查 台鐵局~ 太魯閣~ 號是 撞 上台 鐵局 工程車
## 5 https://~ 盲 護航 急好 ㄇ 台鐵局~ 太魯閣~ 號是 撞 上台 鐵局 工程車
## 6 https://~ 鐵 局局長~ 請辭 這次 花蓮 太魯閣~ 出軌 由誰來~ 負責 司機 工程車
## 7 https://~ 工程車~ 正 東線 火車撞~ 班次 太魯閣~ 應該 第一輛~ 該處 前面 車次
## 8 https://~ 列車 是從 樹林 前往 台東 太魯閣~ 號 全車 搭車 旅客 約
## 9 https://~ 緊急 成立 一級 應變 小組 太魯閣~ 撞 工程車~ 出軌 圖 記者
## 10 https://~ 黃 彥傑 翻攝 原文 快訊 太魯閣~ 出軌 車頭 照片 曝光 高速
## # ... with 285 more rows
<- g_check_words %>%
g_check_words_count melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
%>%
g_check_words_count arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("出現在「太魯閣」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text())
<- function(t) {
ngram_11 lapply(t, function(x) {
<- segment(x, jieba_tokenizer)
tokens <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
ngram unlist(ngram)
})
}<- rd2 %>%
g_ngram_11 select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
<- g_ngram_11 %>%
g_ngrams_11_separated separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
## # A tibble: 107,241 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 獨家 最美 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉
## 2 https://~ 最美 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手
## 3 https://~ 區間車~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手 護送
## 4 https://~ 啟航 小 英 開箱 台 鐵 金牌 運轉 手 護送 親
## 5 https://~ 小 英 開箱 台 鐵 金牌 運轉 手 護送 親 曝
## 6 https://~ 英 開箱 台 鐵 金牌 運轉 手 護送 親 曝 新車
## 7 https://~ 開箱 台 鐵 金牌 運轉 手 護送 親 曝 新車 這裡
## 8 https://~ 台 鐵 金牌 運轉 手 護送 親 曝 新車 這裡 兇
## 9 https://~ 鐵 金牌 運轉 手 護送 親 曝 新車 這裡 兇 蘋果日報~
## 10 https://~ 金牌 運轉 手 護送 親 曝 新車 這裡 兇 蘋果日報~ 小時
## # ... with 107,231 more rows
<- g_ngrams_11_separated %>%
g_check_words filter((word6 == "台鐵"))
g_check_words
## # A tibble: 1,492 x 12
## artUrl word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 https://~ 年 技術 卻 好 脫穎而出~ 台鐵 內部 評價 則是 加速 減速
## 2 https://~ 開車 大學 畢業 後 鎖定 台鐵 報考 特考 期間 先到 日商
## 3 https://~ 一輛 工程車~ 翻覆 邊坡 旁 台鐵 目前 路線 暫以 單線 雙向
## 4 https://~ 干 連假 第一天~ 發生 事故 台鐵 以後 搭 旅客 好好 搭台
## 5 https://~ 生命 跡象 記者 李宜秦~ 台北 台鐵 驚傳 出軌 事故 交通部 台鐵局
## 6 https://~ 台 鐵 事故 不算 少 台鐵 工會 抱怨 何台 鐵 車次
## 7 https://~ 之後 繼續 高鐵 這方面~ 問題 台鐵 問題 在哪 薪水 票價 不足
## 8 https://~ 族 可說是~ 一大 佳音 相信 台鐵 服務 一定 能夠 變得 更好
## 9 https://~ 提升 行車 安全 她並 鼓勵 台鐵 同仁 展現 專 業 悠久
## 10 https://~ 晚上 火車 台北 台東 看到 台鐵 花蓮 出軌 想 請問 版上
## # ... with 1,482 more rows
<- g_check_words %>%
g_check_words_count melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
%>%
g_check_words_count arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("出現在「台鐵」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text())
# 進行斷詞,並計算各詞彙在各文章中出現的次數
<- rd2 %>%
rd_words unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
head(rd_words)
## # A tibble: 6 x 3
## artUrl word n
## <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商 95
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購 55
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵 44
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢 39
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車 38
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道 36
# 計算每篇文章包含的詞數
<- rd_words %>%
total_words group_by(artUrl) %>%
summarize(total = sum(n))
head(total_words)
## # A tibble: 6 x 2
## artUrl total
## <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617207223.A.020.html 13
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617249914.A.8DA.html 430
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617329020.A.9C1.html 17
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617329416.A.0EA.html 29
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617330379.A.DDD.html 18
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617331302.A.E6D.html 176
# 合併 mask_words(每個詞彙在每個文章中出現的次數)
# 與 total_words(每篇文章的詞數)
# 新增各個詞彙在所有詞彙中的總數欄位
<- left_join(rd_words, total_words) rd_words
## Joining, by = "artUrl"
head(rd_words)
## # A tibble: 6 x 4
## artUrl word n total
## <chr> <chr> <int> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 廠商 95 2410
## 2 https://www.ptt.cc/bbs/Gossiping/M.1617815608.A.2EA.html 採購 55 2410
## 3 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 台鐵 44 2454
## 4 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 體檢 39 2454
## 5 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 列車 38 2454
## 6 https://www.ptt.cc/bbs/Gossiping/M.1617434962.A.C57.html 軌道 36 2454
# 以每篇文章爲單位,計算每個詞彙在的tf-idf值
<- rd_words %>%
rd_words_tf_idf bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf
# 選出每篇文章,tf-idf最大的十個詞
%>%
rd_words_tf_idf group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl))
## Selecting by tf_idf
## # A tibble: 16,479 x 7
## # Groups: artUrl [1,485]
## artUrl word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.16~ 李義祥 3 26 0.115 1.84 0.213
## 2 https://www.ptt.cc/bbs/Gossiping/M.16~ 醜化 2 26 0.0769 7.30 0.562
## 3 https://www.ptt.cc/bbs/Gossiping/M.16~ 人格 1 26 0.0385 6.20 0.239
## 4 https://www.ptt.cc/bbs/Gossiping/M.16~ 利害關係~ 1 26 0.0385 6.61 0.254
## 5 https://www.ptt.cc/bbs/Gossiping/M.16~ 害人 1 26 0.0385 6.61 0.254
## 6 https://www.ptt.cc/bbs/Gossiping/M.16~ 追殺 1 26 0.0385 6.61 0.254
## 7 https://www.ptt.cc/bbs/Gossiping/M.16~ 揣測 1 26 0.0385 6.20 0.239
## 8 https://www.ptt.cc/bbs/Gossiping/M.16~ 無數 1 26 0.0385 5.69 0.219
## 9 https://www.ptt.cc/bbs/Gossiping/M.16~ 毀滅 1 26 0.0385 6.61 0.254
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 網軍的 1 26 0.0385 6.20 0.239
## # ... with 16,469 more rows
# 選每篇文章,tf-idf最大的十個詞,
# 並查看每個詞被選中的次數
%>%
rd_words_tf_idf group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 10,754 x 2
## word n
## <chr> <int>
## 1 李義祥 54
## 2 改革 45
## 3 民營化 43
## 4 公司化 36
## 5 民進黨 31
## 6 高鐵 31
## 7 工程車 29
## 8 工程 27
## 9 工地 23
## 10 廠商 23
## # ... with 10,744 more rows
# 計算兩個詞彙間的相關性
<- scan(file = "dict/stop_words.txt", what=character(),sep='\n',
stop_words encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "dict/stop_words.txt", what = character(), sep = "\n", :
## 輸入連結 'dict/stop_words.txt' 中的輸入不正確
<- rd_words %>%
word_cors group_by(word) %>%
filter(n() >= 20) %>%
filter(!(word %in% stop_words)&&!(word %in% "記者")) %>%
pairwise_cor(word, artUrl, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
#word_cors
# 與太魯閣相關性高的詞彙
%>%
word_cors filter(item1 == "太魯閣") %>%
head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 太魯閣 出軌 0.611
## 2 太魯閣 連結 0.515
## 3 太魯閣 事故 0.441
## 4 太魯閣 花蓮 0.395
## 5 太魯閣 滑落 0.394
# 與義祥相關性高的詞彙
%>%
word_cors filter(item1 == "義祥") %>%
head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 義祥 工業 0.603
## 2 義祥 負責人 0.504
## 3 義祥 李義祥 0.291
## 4 義祥 營造 0.283
## 5 義祥 地院 0.251
# 與台鐵相關性高的詞彙
%>%
word_cors filter(item1 == "台鐵") %>%
head(5)
## # A tibble: 5 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 台鐵 改革 0.160
## 2 台鐵 出軌 0.160
## 3 台鐵 指出 0.151
## 4 台鐵 事故 0.148
## 5 台鐵 民營化 0.143
# 分別尋找與 "台鐵", "太魯閣", "政府", "義祥" 相關性最高的 10 個詞彙
%>%
word_cors filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
labs(x=NULL,y=NULL) +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text())
## Selecting by correlation
# 顯示相關性大於0.4的組合
set.seed(2020)
%>%
word_cors filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE) + #加入中文字型設定,避免中文字顯示錯誤。
theme_void()
# 顯示相關性大於0.5的組合
set.seed(2020)
%>%
word_cors filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
在整個事件中,大眾認為太魯閣號的責任應歸咎於台鐵和政府,並希望在太魯閣事件後台鐵能效法高鐵將公司民營化或公司化。