分析PTT八卦版對台灣Me too事件的文字資料和社會網絡資料
背景與動機
研究目的
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud2)
library(text2vec)
載入資料
# Read the scraped article table from the CSV file in the working directory.
data <- read_csv(file = "2023_06_19.csv")
文章斷句
# Sentence splitting.
# A run of two or more newlines is rewritten to a full-width period so that
# it acts as a sentence boundary in the split below.
ptt <- data %>%
  mutate(artContent = gsub("[\n]{2,}", "。", artContent))

# Split every article into sentences on the full-width period and on the
# half- and full-width exclamation mark, question mark and semicolon.
# The split runs on `ptt` (not the raw `data`) so that the newline runs
# rewritten above are honoured as boundaries.
sentence_list <- strsplit(ptt$artContent, "[。!?;!?;]")

# Pair each sentence with the URL of the article it came from: every URL is
# repeated once per sentence of that article. `lengths()` always returns an
# integer vector, including when there are no articles at all.
ptt_artContent <- data.frame(
  artUrl = rep(ptt$artUrl, lengths(sentence_list)),
  artContent = as.character(unlist(sentence_list)),
  stringsAsFactors = FALSE
) %>%
  # Drop sentences that are empty or contain only tabs, newlines or spaces.
  filter(!str_detect(artContent, regex("^(\t|\n| )*$")))
library(DT)
## Warning: package 'DT' was built under R version 4.4.1
# Display the sentence table in this chunk as an interactive widget;
# scrollX enables horizontal scrolling.
datatable(data = ptt_artContent, options = list(scrollX = TRUE))
加入關鍵字
# Keywords that are later registered in the segmenter's user dictionary.
Key_wd <- c(
  "Me too",
  "黃子佼",
  "性騷"
)
文章斷詞
# Create the jiebaR segmentation worker, passing stop-mac.txt as its
# stop-word list.
jieba_tokenizer <- worker(stop_word = "stop-mac.txt")
# Add the keywords in Key_wd to the worker's user dictionary.
new_user_word(worker = jieba_tokenizer, words = Key_wd)
## [1] TRUE
# Custom tokenizer passed to unnest_tokens().
#
# t: character vector of texts.
# Returns a list with one element per input text: a character vector of
# the segmented words longer than one character, or NULL when the text is
# NA or has at most one character.
# Relies on the global `jieba_tokenizer` worker.
chi_tokenizer <- function(t) {
  segment_one <- function(sentence) {
    # Guard clause: nothing to segment for missing or one-character input.
    if (is.na(sentence) || nchar(sentence) <= 1) {
      return(NULL)
    }
    pieces <- segment(sentence, jieba_tokenizer)
    # Keep only words longer than one character.
    pieces[nchar(pieces) > 1]
  }
  lapply(t, segment_one)
}
# Segment the article text in `ptt` into one row per word, then drop the
# artDate column.
tokens_all <- select(
  unnest_tokens(ptt, word, artContent, token = chi_tokenizer),
  -artDate
)
# Remove punctuation and ASCII letters/digits from every sentence, segment
# what is left with chi_tokenizer, then count how many times each word
# occurs in each article (column `count`).
# NOTE(review): stripping [0-9a-zA-Z] also removes Latin-script terms such
# as the "Me too" keyword in Key_wd - confirm this is intended.
tokens <- ptt_artContent %>%
  mutate(
    artContent = gsub("[[:punct:]]", "", artContent),
    artContent = gsub("[0-9a-zA-Z]", "", artContent)
  ) %>%
  unnest_tokens(word, artContent, token = chi_tokenizer) %>%
  count(artUrl, word, name = "count")
# Display the per-article word counts as an interactive table with
# horizontal scrolling.
datatable(data = tokens, options = list(scrollX = TRUE))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
清理斷詞結果
# Threshold for keeping a word: it must appear in more than `freq` articles.
freq <- 3
# Select the words to keep. `tokens` holds one row per (article, word), so
# counting rows per word gives the number of articles containing that word,
# not its total number of occurrences.
# pull(word) extracts only the word column; flattening the whole count
# table would also put the counts, as strings, into the whitelist.
reserved_word <- tokens %>%
  count(word) %>%
  filter(n > freq) %>%
  pull(word)
# Keep only the rows of `tokens` whose word passed the threshold.
ptt_removed <- tokens %>%
  filter(word %in% reserved_word)
# Cast the filtered counts into a document-term matrix:
# rows = articles (artUrl), columns = words, cell values = count.
ptt_dtm <- cast_dtm(ptt_removed, artUrl, word, count)
文字雲
# Total occurrences of each retained word across all articles, sorted from
# most to least frequent.
cloud <- arrange(
  summarise(group_by(ptt_removed, word), sum = sum(count), .groups = "drop"),
  desc(sum)
)
# Draw the word cloud from words whose total exceeds 50.
wordcloud2(filter(cloud, sum > 50))