資料涵蓋看板上的全部時間範圍,抓取至 2/16 為止,共約 77,740 篇文章;過濾掉公告類文章後約剩 72,000 篇。

# Load the previously scraped posts from the local CSV file.
df <- read_csv(file = "ptt_hate_1_3990.csv")
## Parsed with column specification:
## cols(
##   author = col_character(),
##   content = col_character(),
##   date = col_datetime(format = ""),
##   title = col_character(),
##   url = col_character()
## )
# Keep only the rows that have no missing value in any column,
# then preview the first few rows.
df <- df[which(complete.cases(df)), ]
head(df)
## # A tibble: 6 x 5
##   author      content               date                title      url          
##   <chr>       <chr>                 <dttm>              <chr>      <chr>        
## 1 Kaohsiung8… 哇勒,此人混蛋至極,叫他野蠻室友太客氣了… 2003-04-06 20:45:59 恨!我的野蠻室友… https://www.…
## 2 YICHANCHUN… 根本就是把庾澄慶踢掉的超級星期天嘛 感覺… 2003-04-06 21:35:45 什麼快樂星期天阿… https://www.…
## 3 Evance (嘆) 只能夠被動的拒絕面對 不能讓他遠離我的世… 2003-04-06 21:51:59 我好懦弱   https://www.…
## 4 lightthefi… 幹你媽的樓下死小孩 你亂按什麼電玲你不隻… 2003-04-06 22:18:11 幹你娘基八亂按什麼… https://www.…
## 5 geju (逝水) 常常在疑惑,到底什麼是對?什麼又是錯呢?… 2003-05-18 10:15:18 [問題] 來自朋友… https://www.…
## 6 alstonfju … 幹你媽的..說那什麼白目話.. 我們都沒… 2003-05-18 15:29:52 他媽的東洋倭寇死色… https://www.…
# Build the cleaned text column `sentence` from `content`:
#   1. drop posts whose title contains the "[公告] " (announcement) tag;
#   2. strip the symbols matched by the first pattern (copyright/registered
#      signs, code points U+2000-U+3300, and the emoji surrogate ranges);
#   3. strip every character outside the allow-list of the second pattern
#      (alphanumerics, blanks, and the listed ? & / - punctuation);
#   4. replace `content` with the cleaned `sentence` column;
#   5. drop rows whose cleaned text is empty or only tabs/newlines/spaces.
df_sentences <- df %>%
  filter(!str_detect(title, regex("\\[公告\\] "))) %>%
  mutate(
    sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content),
    sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence),
    content = NULL
  ) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# jiebaR segmentation worker (default settings), created once and shared by
# every call to chi_tokenizer().
jieba_tokenizer <- worker()

# Tokenizer for unnest_tokens(): segments each element of `t` with jiebaR and
# returns a list holding one character vector of tokens per element.
# Single-character tokens are discarded, except "我", which is kept because
# it is a word of interest for this study.
chi_tokenizer <- function(t) {
  is_kept <- function(tok) nchar(tok) > 1 | tok %in% "我"
  lapply(t, function(doc) {
    segmented <- segment(doc, jieba_tokenizer)
    segmented[is_kept(segmented)]
  })
}
# Per-post word frequencies: one row per (url, word) pair with its count.
tokens <- df_sentences %>%
  # Segment each cleaned sentence into words with the jieba-based tokenizer.
  unnest_tokens(output = word, input = sentence, token = chi_tokenizer) %>%
  # Discard any token containing an ASCII letter or digit.
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  # Make sure the token column is a plain character vector.
  mutate(word = as.character(word)) %>%
  # Count how often each word occurs within each post (identified by url).
  count(url, word) %>%
  rename(count = n)
# Stop words: read one entry per line from the Traditional Chinese list
# located under `stopword_path`.
stopwords <- scan(file = paste0(stopword_path, "stopwords_tc.txt"), what = character(), sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
# Keep `word` as character: on R < 4.0 data.frame() would otherwise turn it
# into a factor, and joining it against the character `word` column of the
# token table triggers a type-coercion warning.
stopwords_df <- data.frame(word = stopwords, stringsAsFactors = FALSE)
# Corpus-wide word frequencies with stop words removed.
word_count <- tokens %>%
  # Drop stop words; the join key is spelled out so it never depends on
  # whichever column names the two tables happen to share.
  anti_join(stopwords_df, by = "word") %>%
  select(word, count) %>%
  # Sum the per-post counts into one total per word.
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  # Keep only words whose total count exceeds 10.
  filter(count > 10)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# Word cloud of the filtered word frequencies.
wordcloud2(data = word_count, minSize = 2)

「今天」出現的頻率很高,推測許多人上 Hate 版是為了抱怨當天發生的事。另外也很常出現負面用詞,例如「媽的」、「他媽的」、「垃圾」等。
常見的討論主題則有「朋友」、「工作」、「問題」等。

列出最常出現的字

# Show the words ordered from most to least frequent.
arrange(word_count, desc(count))
## # A tibble: 22,225 x 2
##    word  count
##    <chr> <int>
##  1 真的  25682
##  2 知道  22182
##  3 現在  16530
##  4 幹幹  16477
##  5 發財  13956
##  6 一直  13729
##  7 今天  13627
##  8 覺得  12875
##  9 看到   9218
## 10 東西   8780
## # … with 22,215 more rows

顯示tf-idf高到低的字

# Compute tf, idf and tf-idf for every (url, word) pair, treating each post
# (url) as one document, and sort from the highest tf-idf downwards.
tokens_tf_idf <- arrange(
  bind_tf_idf(tokens, term = word, document = url, n = count),
  desc(tf_idf)
)
# Show just the word and its tf-idf score.
select(tokens_tf_idf, word, tf_idf)
## # A tibble: 2,546,654 x 2
##    word   tf_idf
##    <chr>   <dbl>
##  1 材導    11.2 
##  2 努怒    11.2 
##  3 雷屁    10.5 
##  4 接招    10.1 
##  5 讀研    10.1 
##  6 駿駿     9.10
##  7 南山     8.99
##  8 廢文寫   8.38
##  9 呻吟     8.35
## 10 你轉     8.35
## # … with 2,546,644 more rows
# Horizontal bar chart of the 30 highest tf-idf (url, word) rows.
tokens_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Name the ranking column explicitly instead of letting top_n() guess it
  # from the last column, and trim to the top rows *before* building the
  # factor so the levels are computed on ~30 rows rather than the full table.
  # top_n() keeps ties, so more than 30 rows can be returned.
  top_n(30, tf_idf) %>%
  # Order the factor levels so the highest tf-idf ends up at the top of the
  # flipped axis.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  # NOTE(review): a word that ranks in the top rows for several posts gets
  # its bars stacked by geom_col().
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
## Selecting by tf_idf

# Full tf-idf table sorted from highest to lowest score, with `word` turned
# into a factor whose levels follow the reverse of that ordering.
mutate(
  arrange(tokens_tf_idf, desc(tf_idf)),
  word = factor(word, levels = rev(unique(word)))
)
## # A tibble: 2,546,654 x 6
##    url                                            word  count    tf   idf tf_idf
##    <chr>                                          <fct> <int> <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/Hate/M.1551427381.A.25… 材導      1 1     11.2   11.2 
##  2 https://www.ptt.cc/bbs/Hate/M.1578633609.A.A4… 努怒     12 1     11.2   11.2 
##  3 https://www.ptt.cc/bbs/Hate/M.1556055377.A.92… 雷屁      1 1     10.5   10.5 
##  4 https://www.ptt.cc/bbs/Hate/M.1117643830.A.AC… 接招      1 1     10.1   10.1 
##  5 https://www.ptt.cc/bbs/Hate/M.1581023808.A.C2… 讀研      1 1     10.1   10.1 
##  6 https://www.ptt.cc/bbs/Hate/M.1551009033.A.12… 駿駿      1 1      9.10   9.10
##  7 https://www.ptt.cc/bbs/Hate/M.1552828036.A.DC… 南山      1 1      8.99   8.99
##  8 https://www.ptt.cc/bbs/Hate/M.1550075145.A.6D… 廢文寫…    64 0.831 10.1    8.38
##  9 https://www.ptt.cc/bbs/Hate/M.1554557736.A.8C… 呻吟      1 1      8.35   8.35
## 10 https://www.ptt.cc/bbs/Hate/M.1571542392.A.82… 你轉      1 1      8.35   8.35
## # … with 2,546,644 more rows