資料涵蓋看板上所有時間的文章,抓取至 2/16 為止,總共約 77,740 筆文章,過濾掉公告系列文章後約 72,000 筆。
# Load the previously scraped posts from the local CSV file.
df <- read_csv(file = "ptt_hate_1_3990.csv")
## Parsed with column specification:
## cols(
## author = col_character(),
## content = col_character(),
## date = col_datetime(format = ""),
## title = col_character(),
## url = col_character()
## )
# Keep only the rows that have no NA in any column, then preview the result.
df <- df[complete.cases(df), ]
head(df, n = 6L)
## # A tibble: 6 x 5
## author content date title url
## <chr> <chr> <dttm> <chr> <chr>
## 1 Kaohsiung8… 哇勒,此人混蛋至極,叫他野蠻室友太客氣了… 2003-04-06 20:45:59 恨!我的野蠻室友… https://www.…
## 2 YICHANCHUN… 根本就是把庾澄慶踢掉的超級星期天嘛 感覺… 2003-04-06 21:35:45 什麼快樂星期天阿… https://www.…
## 3 Evance (嘆) 只能夠被動的拒絕面對 不能讓他遠離我的世… 2003-04-06 21:51:59 我好懦弱 https://www.…
## 4 lightthefi… 幹你媽的樓下死小孩 你亂按什麼電玲你不隻… 2003-04-06 22:18:11 幹你娘基八亂按什麼… https://www.…
## 5 geju (逝水) 常常在疑惑,到底什麼是對?什麼又是錯呢?… 2003-05-18 10:15:18 [問題] 來自朋友… https://www.…
## 6 alstonfju … 幹你媽的..說那什麼白目話.. 我們都沒… 2003-05-18 15:29:52 他媽的東洋倭寇死色… https://www.…
# Build the cleaned text column `sentence` from `content`, one row per post.
df_sentences <- df %>%
  # Drop announcement posts, i.e. titles containing "[公告] ".
  filter(!str_detect(title, "\\[公告\\] ")) %>%
  mutate(
    # Strip (c), (R), everything in U+2000-U+3300 (punctuation and symbols)
    # and the emoji surrogate ranges listed in the pattern.
    sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content),
    # Whitelist filter: remove every character that is not alphanumeric,
    # blank, or one of the few symbols listed in the bracket expression.
    # (This is a character filter, not an encoding check.)
    sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence),
    # The raw text is no longer needed once `sentence` exists.
    content = NULL
  ) %>%
  # Discard posts whose cleaned text is empty or whitespace only.
  filter(!str_detect(sentence, "^(\t|\n| )*$"))
# Shared jiebaR segmentation worker, created once and reused for every text.
jieba_tokenizer <- worker()

# Tokenize Chinese text with the jieba worker.
#
# t: a vector/list of texts; each element is segmented independently.
# Returns a list with one character vector of tokens per element of `t`.
# Single-character tokens are dropped, except for the study word "我",
# which is deliberately kept.
chi_tokenizer <- function(t) {
  segment_one <- function(text) {
    pieces <- segment(text, jieba_tokenizer)
    is_multi_char <- nchar(pieces) > 1
    is_study_word <- pieces %in% "我"
    pieces[is_multi_char | is_study_word]
  }
  lapply(t, segment_one)
}
# Per-post word frequencies: one row per (url, word) pair with its count.
tokens <- df_sentences %>%
  # Segment each post into words using the custom jieba tokenizer.
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  # Drop every token that contains an ASCII letter or digit.
  filter(!str_detect(word, "[0-9a-zA-Z]")) %>%
  # Make sure the token column is a plain character vector.
  mutate(word = as.character(word)) %>%
  # Count occurrences of each word within each post (keyed by url).
  count(url, word) %>%
  rename(count = n)
# Stop words: read one entry per line from stopwords_tc.txt.
# NOTE(review): `stopword_path` is defined elsewhere and is pasted directly in
# front of the file name, so it presumably ends with a path separator.
stopwords <- scan(file = paste0(stopword_path, "stopwords_tc.txt"), what = character(), sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
# Keep `word` as character (not factor) so the join key has the same type as
# tokens$word; on R < 4.0 data.frame() would otherwise create a factor and the
# join would warn about coercing factor to character.
stopwords_df <- data.frame(word = stopwords, stringsAsFactors = FALSE)

# Corpus-wide word frequencies with stop words removed.
word_count <- tokens %>%
  # Remove stop words; the join key is stated explicitly instead of guessed.
  anti_join(stopwords_df, by = "word") %>%
  select(word, count) %>%
  # Sum the per-post counts into one total per word.
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  # Keep only words that occur more than 10 times overall.
  filter(count > 10)
# Word cloud of the corpus-wide word frequencies.
wordcloud2(data = word_count, minSize = 2)
「今天」一直出現,頻率很高,猜測可能很多人上 hate 版都是抱怨今天的事。 也很常出現一些負面用詞,例如媽的、他媽的、垃圾等。 列出最常出現的字:
# Show the words ordered from most to least frequent.
arrange(word_count, desc(count))
## # A tibble: 22,225 x 2
## word count
## <chr> <int>
## 1 真的 25682
## 2 知道 22182
## 3 現在 16530
## 4 幹幹 16477
## 5 發財 13956
## 6 一直 13729
## 7 今天 13627
## 8 覺得 12875
## 9 看到 9218
## 10 東西 8780
## # … with 22,215 more rows
顯示tf-idf高到低的字
# Compute tf-idf per (word, post), treating each url as one document, and
# order the rows from highest to lowest tf-idf.
tokens_tf_idf <- arrange(
  bind_tf_idf(tokens, word, url, count),
  desc(tf_idf)
)
# Print just the word and its tf-idf score.
select(tokens_tf_idf, word, tf_idf)
## # A tibble: 2,546,654 x 2
## word tf_idf
## <chr> <dbl>
## 1 材導 11.2
## 2 努怒 11.2
## 3 雷屁 10.5
## 4 接招 10.1
## 5 讀研 10.1
## 6 駿駿 9.10
## 7 南山 8.99
## 8 廢文寫 8.38
## 9 呻吟 8.35
## 10 你轉 8.35
## # … with 2,546,644 more rows
# Horizontal bar chart of the 30 highest tf-idf rows (ties are kept by top_n).
tokens_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Name the ranking column explicitly rather than relying on top_n()'s
  # implicit "last column" default, which breaks if columns are reordered.
  top_n(30, tf_idf) %>%
  # Build the ordered factor only on the retained rows; reversing the levels
  # puts the highest tf-idf at the top of the chart after coord_flip().
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
# Print the full tf-idf table sorted from highest to lowest score, with `word`
# converted to a factor whose levels are the reversed order of first
# appearance.
mutate(
  arrange(tokens_tf_idf, desc(tf_idf)),
  word = factor(word, levels = rev(unique(word)))
)
## # A tibble: 2,546,654 x 6
## url word count tf idf tf_idf
## <chr> <fct> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Hate/M.1551427381.A.25… 材導 1 1 11.2 11.2
## 2 https://www.ptt.cc/bbs/Hate/M.1578633609.A.A4… 努怒 12 1 11.2 11.2
## 3 https://www.ptt.cc/bbs/Hate/M.1556055377.A.92… 雷屁 1 1 10.5 10.5
## 4 https://www.ptt.cc/bbs/Hate/M.1117643830.A.AC… 接招 1 1 10.1 10.1
## 5 https://www.ptt.cc/bbs/Hate/M.1581023808.A.C2… 讀研 1 1 10.1 10.1
## 6 https://www.ptt.cc/bbs/Hate/M.1551009033.A.12… 駿駿 1 1 9.10 9.10
## 7 https://www.ptt.cc/bbs/Hate/M.1552828036.A.DC… 南山 1 1 8.99 8.99
## 8 https://www.ptt.cc/bbs/Hate/M.1550075145.A.6D… 廢文寫… 64 0.831 10.1 8.38
## 9 https://www.ptt.cc/bbs/Hate/M.1554557736.A.8C… 呻吟 1 1 8.35 8.35
## 10 https://www.ptt.cc/bbs/Hate/M.1571542392.A.82… 你轉 1 1 8.35 8.35
## # … with 2,546,644 more rows