# ch0. Package Setup and Data Loading

### Packages

library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(servr)
library(topicmodels)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
require(RColorBrewer)
## Loading required package: RColorBrewer
require(tidyr)
## Loading required package: tidyr
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

### Data Description

Using the text-analytics platform of the NSYSU College of Management, we load articles from the PTT Gossiping board matching the keyword 「館長」, posted between 2020/01/01 and 2020/12/31.

metadata <- fread("gym_articleMetaData.csv", encoding = "UTF-8")
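A quick structural check before plotting (a minimal sketch; the platform export is assumed to contain at least the artDate, artUrl, and sentence columns that the code below relies on):

glimpse(metadata)  # column types; expect artDate, artUrl, sentence among them
nrow(metadata)     # number of articles retrieved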

Discussion volume surges in August, when 館長 was shot at the end of the month.

metadata %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
    geom_point()
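To back up the claim numerically, a quick monthly tally (a small sketch over the same columns):

metadata %>%
  mutate(artDate = as.Date(artDate)) %>%
  count(month = format(artDate, "%Y-%m")) %>%
  arrange(desc(n))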

# ch1. Document Term Matrix (DTM)

### Data Preprocessing

Remove the template wording (news-repost headers and board-rule text) that appears when news articles are reposted on PTT.

metadata <- metadata %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|\\(或短網址\\)|備註|備註請放最後面|違者新聞文章刪除|張貼問卦請注意|充實文章內容|是否有專板|本板並非萬能問板|一天只能張貼|自刪及被刪也算兩篇之內|超貼者將被水桶|本看板嚴格禁止政治問卦|發文問卦前請先仔細閱讀相關板規|未滿30繁體中文字水桶3個月|嚴重者以鬧板論", "", sentence))
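A quick sanity check that the template phrases are really gone (a minimal sketch; any phrase from the pattern above can be substituted):

sum(str_detect(metadata$sentence, "完整新聞標題"))  # should be 0 after the cleanup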

### Bigrams

jieba_tokenizer = worker()

# bigram tokenizer function used by unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

metadata_bigram <- metadata %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)

metadata_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE)

### Trigrams

jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}

metadata_trigram <- metadata %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
metadata_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE)

### Remove stop words from bigrams

# load stop words
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./stop_words.txt", what = character(), sep = "\n", :
## invalid input found on input connection './stop_words.txt'
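The warning above points to an encoding or line-ending problem in stop_words.txt. If it shows up on your machine, one workaround is to read the file with readLines instead (a hedged sketch, not the course's canonical loader):

# alternative loader; skipNul drops stray NUL bytes that can trip up scan()
stop_words <- readLines("stop_words.txt", encoding = "UTF-8",
                        warn = FALSE, skipNul = TRUE)
stop_words <- stop_words[stop_words != ""]  # drop empty lines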
metadata_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")

### Remove stop words from trigrams

metadata_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("ngrams", c("word1", "word2", "word3"), sep=" ")

### Use a custom user dictionary and a stop-word dictionary

jieba_tokenizer = worker(user="word.txt",stop_word="stop_words.txt")
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

### Count token occurrences per article

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter((!str_detect(word, regex("[0-9a-zA-Z]")))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens %>% head(100)
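Aggregating across articles shows the most frequent tokens corpus-wide (a quick sketch on the tokens table just built):

tokens %>%
  group_by(word) %>%
  summarise(total = sum(count)) %>%
  arrange(desc(total)) %>%
  head(20)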

### Convert the data into a Document Term Matrix (DTM)

dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 3028, terms: 31325)>>
## Non-/sparse entries: 138880/94713220
## Sparsity           : 100%
## Maximal term length: 11
## Weighting          : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity           : 88%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       一天 一包 二月
##   https://www.ptt.cc/bbs/Gossiping/M.1577839129.A.52E.html    1    2    1
##   https://www.ptt.cc/bbs/Gossiping/M.1577846021.A.250.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577850635.A.8DA.html    0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577855226.A.44A.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577870686.A.B76.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577872706.A.3BF.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577888419.A.A93.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577889244.A.526.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577890990.A.908.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577894589.A.09F.html    0    0    0
##                                                           Terms
## Docs                                                       二月份 不用 中時
##   https://www.ptt.cc/bbs/Gossiping/M.1577839129.A.52E.html      1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1577846021.A.250.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577850635.A.8DA.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577855226.A.44A.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577870686.A.B76.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577872706.A.3BF.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577888419.A.A93.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577889244.A.526.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577890990.A.908.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577894589.A.09F.html      0    0    0
##                                                           Terms
## Docs                                                       引發 手錶 加油 半年
##   https://www.ptt.cc/bbs/Gossiping/M.1577839129.A.52E.html    2    2    1    3
##   https://www.ptt.cc/bbs/Gossiping/M.1577846021.A.250.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577850635.A.8DA.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577855226.A.44A.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577870686.A.B76.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577872706.A.3BF.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577888419.A.A93.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577889244.A.526.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577890990.A.908.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1577894589.A.09F.html    0    0    0    0
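If the sparse counts are needed back in long form (e.g., for joins or plots), tidytext can invert the cast (a minimal sketch):

tidy(dtm) %>% head()  # one row per nonzero (document, term, count) triple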

# ch2. Topic Models

### Build the LDA model

lda <- LDA(dtm, k = 2, control = list(seed = 2021)) # default VEM fit (overwritten by the Gibbs fit below)
lda <- LDA(dtm, k = 2, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs") # Gibbs sampling; tune alpha and delta here
lda
## A LDA_Gibbs topic model with 2 topics.

### Extract the phi matrix from the LDA model

topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
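Each topic's phi values form a probability distribution over the vocabulary, so they should sum to 1 within every topic (a quick consistency check):

topics_words %>%
  group_by(topic) %>%
  summarise(sum_phi = sum(phi))  # each sum should be numerically 1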

### Find each topic's representative terms

Sort the terms in each topic by phi in descending order and list the top 10.

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  mutate(top_words = reorder_within(term,phi,topic)) %>%
  ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

# ch3. Finding the Optimal Number of Topics

### Fit models with more topics

Try 2, 3, 4, 5, and 6 topics and store the fitted models for further analysis. This step takes a while to run, so the finished models have been saved to ldas_result.rdata and can be loaded directly.

# ldas = c()
# topics = c(2,3,4,5,6)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
#   ldas = c(ldas, lda)
#   print(paste(topic, "topic(s), elapsed:", Sys.time() - start_time))
#   save(ldas, file = "ldas_result.rdata") # save the models to a file
# }

### Load the saved LDA results

load("ldas_result.rdata")

### Find the optimal number of topics via perplexity

topics = c(2,3,4,5,6)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

# if (!requireNamespace("ldatuning", quietly = TRUE)) install.packages("ldatuning")
library("ldatuning")
result <- FindTopicsNumber(
  dtm,
  topics = topics,
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 2020),
  mc.cores = 2L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)

# ch4. LDA Analysis

### Select the topic model with 4 topics

the_lda = ldas[[3]] ## the k = 4 model (third element of topics = c(2,3,4,5,6))
topics_words <- tidy(the_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(10)

Sort the terms in each topic by phi in descending order

topics_words %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Remove words that are common across all topics.

removed_word = c("館長","真的","應該","有沒有","今天","現在","看到","表示","覺得","直播","一堆","問題","八卦","陳之漢","萬元","告訴","知道","認為","這次","台灣","一下","一定","根本","東西","之前","有人","比較","直接","不知道","已經")

topics_words %>%
  filter(!term  %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, phi) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

### Name the topics

topics_name = c("館長與政客之互動","館長與他人之爭執","館長遭槍擊","館長事業之討論")
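These labels come from eyeballing each topic's top terms; topicmodels' terms() helper prints them compactly if you want to double-check the naming (a minimal sketch):

terms(the_lda, 10)  # top 10 terms per topic, one column per topic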

### Document-topic distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata,document_topics_df)

Now we can see every article's topic distribution!
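Since posterior() returns proper probability distributions, each article's four topic proportions should sum to 1 (a quick check on the data frame built above):

summary(rowSums(document_topics_df))  # min and max should both be numerically 1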

### Inspect articles for a given topic

  • 透過找到特定文章的分佈進行排序之後,可以看到此主題的比重高的文章在討論什麼。
news_topic %>%
  arrange(desc(`館長與政客之互動`)) %>%head(20)

news_topic %>%
  arrange(desc(`館長與他人之爭執`)) %>%head(20)

news_topic %>%
  arrange(desc(`館長遭槍擊`)) %>%head(20)

news_topic %>%
  arrange(desc(`館長事業之討論`)) %>%head(20)

### Topic trends over time

news_topic %>% 
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot( aes(x=artDate, y=value, fill = variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors[c(1,3,5,7)])+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

### Topic trends over time as proportions

news_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  #filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
    scale_fill_manual(values=mycolors[c(1,3,5,7)])+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

# > topic 館長與政客之互動:
#   02 館長 threatens on a livestream to "grab knives and guns and kill KMT members"; 詹江村 sues for intimidation and incitement
#   04 Fined NT$600,000 for an ad violation, 館長 angrily tells 柯文哲 not to come looking for him in 2022
#   10 Push for "actual floor-area pricing" sparks a firestorm; 館長 defends 高嘉瑜: "徐國勇 is truly shameless"
#   10 柯文哲 comes out in support of 中天; 館長, once his biggest fan, admits deep disappointment
#   11 館長 blasts 徐國勇 for failing on public safety
# 
# > topic 館長與他人之爭執:
#   02 館長 sued for insulting 吳宗憲, while also selling out 余天 on a livestream
#   08 Troll who said "館長's wife will be dealt with" apologizes, then threatens a netizen: "get me fired and I'll rob your house"
#   12 館長 sued for insulting 羅文
# 
# > topic 館長遭槍擊:
#   08 館長 shooting case; 黃國昌 acts as his lawyer
# 
# > topic 館長事業之討論:
#   01 館長's new website crushed by hackers; he vows to sue for "several million at the very least" in damages

# Supplement - Other packages for training LDA models

Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(udpipe)
jieba_tokenizer = worker(user="word.txt",stop_word="stop_words.txt")
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

tokens <- metadata %>%
  unnest_tokens(word, sentence, token=news_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) # drop alphanumeric tokens, as in ch1

### Build the DTM

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 3028 1074

### LDA model

set.seed(2019)

topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [23:21:40.235] early stopping at 190 iteration 
## INFO  [23:21:40.801] early stopping at 80 iteration

This runs many times faster than the topicmodels package.
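To quantify the speed-up on your own machine, wrap the fit in system.time (a rough benchmark sketch; absolute times depend on hardware and iteration settings):

system.time({
  m <- text2vec::LDA$new(n_topics = 4, doc_topic_prior = 0.1, topic_word_prior = 0.001)
  invisible(m$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5))
})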

As before, the results can be explored with the LDAvis package.

lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 terms per topic
##       [,1]   [,2]     [,3]   [,4]    
##  [1,] "館長" "館長"   "開槍" "陳之漢"
##  [2,] "網站" "台灣"   "槍手" "網友"  
##  [3,] "肌肉" "中國"   "槍擊" "被告"  
##  [4,] "健身" "民進黨" "律師" "臉書"  
##  [5,] "助理" "直播"   "警方" "台北"  
##  [6,] "格鬥" "國民黨" "黑道" "吳宗憲"
##  [7,] "訓練" "支持"   "台中" "網紅"  
##  [8,] "小宇" "余天"   "林口" "記者"  
##  [9,] "影片" "柯文哲" "館長" "道歉"  
## [10,] "專業" "立委"   "凌晨" "羅文"
lda_model$plot()
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)