Text analysis of PTT Gossiping (八卦) board posts on the Taroko Express (太魯閣號) derailment
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
= c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr")
packages = as.character(installed.packages()[,1])
existing for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
install.packages('knitr')
require(data.table)
require(dplyr)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
library(data.table)
require(wordcloud2)
require(lubridate)
require(htmlwidgets)
require(webshot)
require(plotly)
require(RColorBrewer)
require(servr)
require(tm)
require(data.table)
require(stringr)
library(topicmodels)
require(LDAvis)
require(webshot)
<- read_csv("太魯閣事件_artWordFreq.csv") rd
-- Column specification ----------------------------------------------------------------------------------------
cols(
artTitle = col_character(),
artDate = col_date(format = ""),
artTime = col_time(format = ""),
artUrl = col_character(),
word = col_character(),
count = col_double()
)
<- read_csv("太魯閣事件_articleMetaData.csv") rd2
-- Column specification ----------------------------------------------------------------------------------------
cols(
artTitle = col_character(),
artDate = col_date(format = ""),
artTime = col_time(format = ""),
artUrl = col_character(),
artPoster = col_character(),
artCat = col_character(),
commentNum = col_double(),
push = col_double(),
boo = col_double(),
sentence = col_character()
)
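If the schema should stay fixed between runs, the column specification reported above can be passed to read_csv() explicitly via col_types, which also silences the message. A minimal sketch for rd2, assuming the columns listed above:

# Sketch: pin the column types of the metadata file explicitly (readr's cols()/col_*).
rd2 <- read_csv(
  "太魯閣事件_articleMetaData.csv",
  col_types = cols(
    artTitle   = col_character(),
    artDate    = col_date(format = ""),
    artTime    = col_time(format = ""),
    artUrl     = col_character(),
    artPoster  = col_character(),
    artCat     = col_character(),
    commentNum = col_double(),
    push       = col_double(),
    boo        = col_double(),
    sentence   = col_character()
  )
)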
rd$artDate <- rd$artDate %>% as.Date("%Y/%m/%d")
rd
# Load the custom user dictionary and stop-word list
jieba_tokenizer <- worker(user = "dict/user_dict.txt", stop_word = "dict/stop_words.txt")

# Tokenizer function for unnest_tokens()
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
  })
}
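As a quick sanity check, the tokenizer can be called directly on a short sample sentence; the exact segmentation depends on dict/user_dict.txt, so the result is only illustrative.

# Returns a list containing one character vector of tokens longer than one character.
chi_tokenizer("台鐵太魯閣號出軌事件")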
data <- rd %>%
  dplyr::select(artDate, artUrl) %>%
  distinct()
article_count_by_date <- data %>%
  group_by(artDate) %>%
  summarise(count = n())
head(article_count_by_date, 20)
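One optional step, assuming some days may have no posts at all: tidyr::complete() can insert explicit zero rows so the time-series plot below does not interpolate across missing dates.

# Sketch: add zero-count rows for any day without articles.
article_count_by_date <- article_count_by_date %>%
  complete(artDate = seq(min(artDate), max(artDate), by = "day"),
           fill = list(count = 0))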
plot_date <-
  # data
  article_count_by_date %>%
  # aesthetics
  ggplot(aes(x = artDate, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) +
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("PTT 八卦版 討論文章數") +
  xlab("日期") +
  ylab("數量") +
  # theme
  theme() # set a Chinese-capable font here to avoid garbled CJK labels
plot_date
data <- rd %>%
  group_by(word) %>%
  summarise(sum = sum(count), .groups = 'drop') %>%
  arrange(desc(sum))
head(data)
data %>% filter(sum > 50) %>% wordcloud2()
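If a static image of the word cloud is needed (e.g. for a report), the htmlwidget returned by wordcloud2() can be saved and screenshotted with the htmlwidgets and webshot packages already loaded above; the file names below are arbitrary.

# Sketch: save the widget to HTML, then render it to PNG.
# webshot requires PhantomJS (webshot::install_phantomjs()).
wc <- data %>% filter(sum > 50) %>% wordcloud2()
saveWidget(wc, "wordcloud.html", selfcontained = FALSE)
webshot("wordcloud.html", "wordcloud.png", delay = 5, vwidth = 992, vheight = 744)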
rd_tokens <- rd %>%
  select(-artTime, -artUrl)
head(rd_tokens)
rd_tokens_by_date <- rd_tokens %>%
  count(artDate, word, sort = TRUE) %>%
  filter(n > 5)
rd_tokens_by_date
stop_words <- scan(file = "dict/stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
plot_merge <- rd_tokens_by_date %>%
  filter(!(word %in% stop_words) & !(word %in% "台鐵")) %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-10"), by = "day")) %>%
  group_by(artDate) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales = "free", ncol = 5) +
  coord_flip() +
  theme(text = element_text())
plot_merge
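Note that top_n() keeps ties, so some facets may show more than ten bars; a variant with slice_max() (dplyr >= 1.0) returns exactly ten words per day if that is preferred.

# Sketch: exactly ten words per day, dropping ties.
top10_by_day <- rd_tokens_by_date %>%
  filter(!(word %in% stop_words) & !(word %in% "台鐵")) %>%
  group_by(artDate) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup()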
# Positive-word dictionary (txt file, words separated by commas)
P <- read_file("dict/positive.txt")
# Negative-word dictionary
N <- read_file("dict/negative.txt")

# Split the strings on commas; strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build a data frame with two columns, word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
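A couple of quick sanity checks on the combined dictionary are useful before joining it to the tokens: the number of words per polarity, and whether any word is listed under both polarities (such words would be double-counted by the joins below).

LIWC %>% count(sentiment)
LIWC %>% count(word) %>% filter(n > 1)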
rd_tokens_by_date %>%
  inner_join(LIWC, by = "word") %>%
  select(word) %>%
  inner_join(LIWC, by = "word")
sentiment_count = rd_tokens_by_date %>%
  select(artDate, word, n) %>%
  inner_join(LIWC, by = "word") %>%
  group_by(artDate, sentiment) %>%
  summarise(count = sum(n))
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  labs(x = NULL, y = "數量") +
  scale_x_date(labels = date_format("%m/%d"))
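A complementary view, sketched below, collapses the two lines into a single daily net-sentiment series (positive minus negative counts) with tidyr::pivot_wider(); the "positive" and "negative" column names come from the sentiment labels defined above.

sentiment_count %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = artDate, y = net)) +
  geom_col() +
  labs(x = NULL, y = "positive - negative")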
rd_tokens_all <- rd2 %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  select(-artTime)
rd_tokens_all %>%
  filter(artDate %in% seq(as.Date("2021-04-02"), as.Date("2021-04-08"), by = "day")) %>%
  inner_join(LIWC, by = "word") %>%
  group_by(word, sentiment) %>%
  summarise(count = n()) %>%
  data.frame() %>%
  top_n(30, wt = count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x = "文字", y = "數量") +
  facet_wrap(~sentiment, scales = "free_y") +
  theme(text = element_text(size = 14)) +
  coord_flip()
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", 1:11), sep = " ")
head(g_ngrams_11_separated)
g_check_words <- g_ngrams_11_separated %>%
  filter(word6 == "太魯閣")
g_check_words
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word = value) %>%
  filter(variable != "word6") %>%
  filter(!(word %in% stop_words), nchar(word) > 1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「太魯閣」附近的字") +
  ylab("出現次數") +
  coord_flip() +
  theme(text = element_text())
# Repeat the same 11-gram pipeline, this time centred on "台鐵"
ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}
g_ngram_11 <- rd2 %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", 1:11), sep = " ")
g_ngrams_11_separated
g_check_words <- g_ngrams_11_separated %>%
  filter(word6 == "台鐵")
g_check_words
g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
  rename(word = value) %>%
  filter(variable != "word6") %>%
  filter(!(word %in% stop_words), nchar(word) > 1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「台鐵」附近的字") +
  ylab("出現次數") +
  coord_flip() +
  theme(text = element_text())
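The two blocks above repeat the same pipeline with only the keyword changed; a small helper (the hypothetical count_neighbors() below) factors that out so further keywords can be inspected without copy-pasting.

# Hypothetical helper: count words near a given keyword using the 11-gram table
# built above (word6 is the centre position).
count_neighbors <- function(keyword, ngrams = g_ngrams_11_separated) {
  ngrams %>%
    filter(word6 == keyword) %>%
    melt(id.vars = "artUrl", measure.vars = paste0("word", 1:11)) %>%
    rename(word = value) %>%
    filter(variable != "word6") %>%
    filter(!(word %in% stop_words), nchar(word) > 1) %>%
    count(word, sort = TRUE)
}
count_neighbors("政府") %>% head(15)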
# Tokenize the article text and count how often each word appears in each article
rd_words <- rd2 %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
head(rd_words)
# Count the number of tokens in each article
total_words <- rd_words %>%
  group_by(artUrl) %>%
  summarize(total = sum(n))
head(total_words)

# Join rd_words (per-article word counts) with total_words (tokens per article)
# to add each article's total token count as a column
rd_words <- left_join(rd_words, total_words, by = "artUrl")
head(rd_words)
# Compute tf-idf for each word, treating each article as a document
rd_words_tf_idf <- rd_words %>%
  bind_tf_idf(word, artUrl, n)
#rd_words_tf_idf

# The ten words with the highest tf-idf in each article
rd_words_tf_idf %>%
  group_by(artUrl) %>%
  top_n(10, tf_idf) %>%
  arrange(desc(artUrl))

# The same selection, counting how often each word is picked across articles
rd_words_tf_idf %>%
  group_by(artUrl) %>%
  top_n(10, tf_idf) %>%
  arrange(desc(artUrl)) %>%
  ungroup() %>%
  count(word, sort = TRUE)
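To see what the tf-idf weighting picks out for an individual post, the top terms of a single article can be plotted; the sketch below simply takes the first artUrl in the table, and any article URL of interest can be substituted.

one_url <- rd_words_tf_idf$artUrl[1]
rd_words_tf_idf %>%
  filter(artUrl == one_url) %>%
  slice_max(order_by = tf_idf, n = 10) %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "tf-idf")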
# Compute pairwise correlations between words
stop_words <- scan(file = "dict/stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8')
word_cors <- rd_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  filter(!(word %in% stop_words) & !(word %in% "記者")) %>%
  pairwise_cor(word, artUrl, sort = TRUE)
#word_cors
# Words most correlated with "太魯閣"
word_cors %>%
  filter(item1 == "太魯閣") %>%
  head(5)
# Words most correlated with "義祥"
word_cors %>%
  filter(item1 == "義祥") %>%
  head(5)
# Words most correlated with "台鐵"
word_cors %>%
  filter(item1 == "台鐵") %>%
  head(5)
# The ten words most correlated with each of "台鐵", "太魯閣", "政府", and "義祥"
word_cors %>%
  filter(item1 %in% c("台鐵", "太魯閣", "政府", "義祥")) %>%
  group_by(item1) %>%
  top_n(10, correlation) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = NULL) +
  facet_wrap(~ item1, scales = "free") +
  coord_flip() +
  theme(text = element_text())
# Word pairs with correlation above 0.4
set.seed(2020)
word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + # set a Chinese-capable font family here to avoid garbled labels
  theme_void()
# Word pairs with correlation above 0.5
set.seed(2020)
word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
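Beyond eyeballing the graphs, the most connected words can be ranked by node degree; a short sketch with igraph::degree(), using the same 0.4 threshold as the first graph above.

cor_graph <- word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame()
sort(degree(cor_graph), decreasing = TRUE) %>% head(10)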
The analysis above shows that negative sentiment toward the incident consistently outweighed positive sentiment throughout the event. The negative sentiment stemmed mainly from 台鐵's (the Taiwan Railways Administration's) aging, loosely run bureaucratic culture and the government's poor oversight of its contractors: the work was outsourced to 義祥工業社 even though the firm had a long record of prior violations. The positive sentiment mostly reflected hopes that 台鐵 would follow 高鐵's (Taiwan High Speed Rail's) example and reform itself, corporatizing or privatizing the organization so that similar accidents do not happen again.