## [1] ""
## [1] "Chinese (Traditional)_Taiwan.950"
packages = c("readr", "tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "purrr", "ramify", "RColorBrewer", "htmlwidgets", "servr", "igraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 載入packages
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# Article data
Detention <- fread("Detention.csv", encoding = "UTF-8")
Detention$artDate = Detention$artDate %>% as.Date("%Y/%m/%d") # convert the date column from chr to Date
# Comment (reply) data
Detention_review <- fread("Detention_articleReviews.csv", encoding = "UTF-8")
# Keep only the columns we need
Detention_review <- Detention_review %>%
  select(artUrl, cmtPoster, cmtStatus, cmtContent)
Detention_review
## artUrl cmtPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html franz10123
## 2: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html kawazakiz2
## 3: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html Tchachavsky
## 4: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html dcoog7880
## 5: https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html AustinRivers
## ---
## 43771: https://www.ptt.cc/bbs/movie/M.1581865305.A.72B.html thinker123
## 43772: https://www.ptt.cc/bbs/movie/M.1585740803.A.313.html aehvtleo
## 43773: https://www.ptt.cc/bbs/movie/M.1585740803.A.313.html mysmalllamb
## 43774: https://www.ptt.cc/bbs/movie/M.1586164182.A.DDF.html engineer1
## 43775: https://www.ptt.cc/bbs/movie/M.1586164182.A.DDF.html Shawnny
## cmtStatus cmtContent
## 1: 推 :樓下噓撕裂族群
## 2: 推 :教官:馬英九飾演
## 3: 推 :抵制為什麼不找馬英九拉
## 4: 推 :推推
## 5: → :馬英九演教官我就10刷
## ---
## 43771: 噓 :哎啊
## 43772: → :難怪覺得眼熟...這兩年前的片了
## 43773: 推 :不錯呀,本來台灣沒興趣的片都出籠了,還有日暮!
## 43774: 推 :支持啊。人少就去看
## 43775: 推 :看都看
Detention %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "blue", size = 1) +
  theme_classic()
We can observe that the data are mainly concentrated after 2019/09, when the Detention film was released.
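As a quick check of this claim, we can count articles per month (a minimal sketch using only the columns already loaded):

# Articles per month; counts should jump from 2019/09 onward
Detention %>%
  mutate(month = format(artDate, "%Y/%m")) %>%
  count(month)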
length(unique(Detention$artPoster))
## [1] 470
length(unique(Detention_review$cmtPoster))
## [1] 12696
allPoster <- c(Detention$artPoster, Detention_review$cmtPoster)
length(unique(allPoster))
## [1] 12909
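The union (12,909) is smaller than 470 + 12,696 because some users both posted articles and left comments; the overlap can be checked directly:

# Users who appear both as article posters and as commenters
length(intersect(unique(Detention$artPoster), unique(Detention_review$cmtPoster)))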
# Initialize a segmentation (tokenizer) engine with default parameters
jieba_tokenizer = worker()
Detention_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x) > 1){
      tokens <- segment(x, jieba_tokenizer)
      # filter out stop words
      stop_words <- c("可以","一個","沒有","覺得","我們","因為","就是","什麼")
      tokens <- filter_segment(tokens, stop_words)
      # drop tokens of length 1
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
    # strings of length <= 1 return NULL and are dropped when un-nesting
  })
}
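A quick way to sanity-check the tokenizer on a single string (the exact output depends on jiebaR's default dictionary):

# Tokenize one sample sentence; stop words and single-character tokens are removed
Detention_tokenizer("今天去看返校這部電影")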
tokens <- Detention %>%
  unnest_tokens(word, sentence, token = Detention_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% # drop tokens containing digits or Latin letters
  count(artUrl, word) %>%
  rename(count = n)
tokens %>% head(10)
## # A tibble: 10 x 3
## artUrl word count
## <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一日 1
## 2 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一段 1
## 3 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 一線 1
## 4 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 不會 1
## 5 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 中學 1
## 6 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 之前 1
## 7 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 內容 2
## 8 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 公關 1
## 9 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 升旗 1
## 10 https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 心理 1
Detention_dtm <- tokens %>% cast_dtm(artUrl, word, count)
Detention_dtm
## <<DocumentTermMatrix (documents: 612, terms: 19188)>>
## Non-/sparse entries: 87588/11655468
## Sparsity : 99%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(Detention_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一日 一段 一線 不會
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0 0 2
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 1 0 0
## Terms
## Docs 中學 之前 內容 公關
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1 2 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 0 0 0
## Terms
## Docs 升旗 心理
## https://www.ptt.cc/bbs/Gossiping/M.1565848968.A.0AA.html 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1565853117.A.D62.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1565960010.A.608.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566113530.A.383.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1566212248.A.420.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567267079.A.BDC.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567507189.A.DF6.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567574321.A.0D8.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567694438.A.6C7.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1567735991.A.1FB.html 0 0
Inspecting the DTM, we can see that it is a sparse matrix.
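As a sanity check, the sparsity reported above can be recomputed from the slots of tm's simple_triplet_matrix representation (a minimal sketch):

# Fraction of zero entries; should match the ~99% sparsity reported by tm
nonzero <- length(Detention_dtm$v)
1 - nonzero / (nrow(Detention_dtm) * ncol(Detention_dtm))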
lda <- LDA(Detention_dtm, k = 2, control = list(seed = 2020))
topics <- tidy(lda, matrix = "beta") #在tidy function裡使用 beta 來取出 Phi 矩陣。
topics
## # A tibble: 38,376 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 一日 0.0000224
## 2 2 一日 0.0000261
## 3 1 一段 0.0000578
## 4 2 一段 0.000739
## 5 1 一線 0.0000420
## 6 2 一線 0.0000800
## 7 1 不會 0.00144
## 8 2 不會 0.00199
## 9 1 中學 0.000810
## 10 2 中學 0.000675
## # ... with 38,366 more rows
The topics table gives, for each topic, the probability that the topic generates each word.
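Since each topic's row of Phi is a probability distribution over the vocabulary, the beta values within a topic should sum to 1; a quick sketch to confirm:

# Per-topic sums of beta; each should be (approximately) 1
topics %>%
  group_by(topic) %>%
  summarise(total_beta = sum(beta))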
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
From the figure above it is hard to tell the two topics apart, so we try fitting models with more topics.
ldas = list()
topics = c(2,3,10,25,36)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(Detention_dtm, k = topic, control = list(seed = 2020))
  ldas[[length(ldas) + 1]] <- lda            # collect the fitted models in a list
  print(paste(topic, "topic(s); elapsed time:", Sys.time() - start_time))
  save(ldas, file = "ldas_Detentionresult")  # checkpoint after each fit
}
The fitted models are saved to ldas_Detentionresult.
load("ldas_Detentionresult")
topics = c(2,3,10,25,36)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
Next we pick a model whose number of topics is manageable and whose perplexity is relatively low (lower perplexity means the model explains the corpus better).
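As a reference point, here is a sketch based on the standard definition of perplexity, exp(-log-likelihood / total token count); for a VEM fit evaluated without held-out data the two numbers below should roughly agree:

# Manual perplexity for the k = 2 model vs. topicmodels' own function
exp(-as.numeric(logLik(ldas[[1]])) / sum(Detention_dtm$v))
topicmodels::perplexity(ldas[[1]])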
topicmodels_json_ldavis <- function(fitted, doc_term){
  require(LDAvis)
  require(slam)
  # 2-D projection for LDAvis: Jensen-Shannon divergence between topics,
  # followed by classical multidimensional scaling
  ls_LDA = function (phi)
  {
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m + 1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m + 1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    mds.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = mds.fit[, 1], y = mds.fit[, 2])
  }
  # Find required quantities
  phi <- as.matrix(posterior(fitted)$terms)    # topic-word distributions
  theta <- as.matrix(posterior(fitted)$topics) # document-topic distributions
  vocab <- colnames(phi)
  doc_length <- slam::row_sums(doc_term)       # total tokens per document
  term_freq <- slam::col_sums(doc_term)        # total occurrences of each term
  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = as.vector(doc_length),
                                 term.frequency = term_freq, mds.method = ls_LDA)
  return(json_lda)
}
topic_10 = ldas[[3]] # the third element of ldas is the k = 10 model
json_res <- topicmodels_json_ldavis(topic_10, Detention_dtm)
serVis(json_res, out.dir = "vis", open.browser = T)
# re-encode the exported JSON as UTF-8 so the Chinese terms display correctly
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"), "./vis/lda.json")
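Indexing ldas by position is fragile; the k slot stored on every fitted topicmodels object lets us confirm the mapping explicitly (a small sketch):

# k of each fitted model, in fitting order; ldas[[3]] should report k = 10
sapply(ldas, function(m) m@k)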
Detention_lda = ldas[[3]] ## use the k = 10 model
topics <- tidy(Detention_lda, matrix = "beta") # again, matrix = "beta" extracts the Phi matrix
topics
## # A tibble: 192,380 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 大家 0.00344
## 2 2 大家 0.00214
## 3 3 大家 0.00239
## 4 4 大家 0.00163
## 5 5 大家 0.00109
## 6 6 大家 0.00184
## 7 7 大家 0.00273
## 8 8 大家 0.00111
## 9 9 大家 0.00317
## 10 10 大家 0.00302
## # ... with 192,370 more rows
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Every topic is dominated by the search keywords that were used to collect the articles in the first place, so it is hard to see how the groups differ. We therefore remove those words:
remove_word = c("返校","電影","自己","遊戲")
top_terms <- topics %>%
  filter(!term %in% remove_word) %>%
  group_by(topic) %>%
  top_n(8, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Now it is much clearer what each topic is mainly about.
topic_name = c("禁書",'劇情','兩岸歷史','政治、社會','電玩改編','None1','票房','金馬獎','None2','None3')
We can name seven of the ten topics: banned books (禁書), plot (劇情), cross-strait history (兩岸歷史), politics and society (政治、社會), video-game adaptation (電玩改編), box office (票房), and the Golden Horse Awards (金馬獎); the remaining three are left as None. A relabeled version of the plot follows.
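To make the facets self-describing, the plot above can be re-drawn with these names in place of the topic numbers (a sketch; topic_name is indexed by topic id):

# Re-draw the top-terms plot with human-readable topic names as facet titles
top_terms %>%
  mutate(topic_label = topic_name[topic],
         term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic_label, scales = "free") +
  coord_flip()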
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(Detention_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 613 10
Each article has a distribution over the topics, hence the 613 articles × 10 topics matrix.
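Each row of doc_pro is a probability distribution, so every article's topic proportions should sum to 1; a quick sketch to check:

# Every document's topic proportions should sum to (approximately) 1
summary(rowSums(doc_pro))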
# get document topic proportions
document_topics <- doc_pro[Detention$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(Detention, document_topics_df)
news_topic %>%
  arrange(desc(`禁書`)) %>% head(10)
The banned-books (禁書) topic indeed centers on historical book bans and the government's suppression of freedom of speech.
#news_topic[,c(11:20)] = sapply(news_topic[,c(11:20)] , as.numeric)
news_topic %>%
  dplyr::select(-commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
We remove the months with little data, as well as the None topics.
news_topic %>%
  filter(!format(artDate, '%Y%m') %in% c("201912","202002","202003","202004")) %>%
  dplyr::select(-None1, -None2, -None3, -commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows each month's volume, but the monthly proportions of each topic are not easy to read.
news_topic %>%
  filter(!format(artDate, '%Y%m') %in% c("201912","202002","202003","202004")) %>%
  dplyr::select(-None1, -None2, -None3, -commentNum, -push, -boo) %>%
  group_by(artDate = format(artDate, '%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))