動機與分析目的:

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# install.packages(c("curl", "httr"))

安裝需要的packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2","tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "Rtsne", "randomcoloR", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(widyr)
require(readr)
require(reshape2)
require(NLP)
require(ggraph)
require(igraph)
require(tm)
require(data.table)
require(quanteda)
require(Matrix)
require(slam)
require(Rtsne)
require(randomcoloR)
require(wordcloud)
require(topicmodels)
require(LDAvis)
require(webshot)
require(htmlwidgets)
require(servr)
library(ggplot2)

資料載入:本資料為2018/06/01 ~ 2020/04/20 PTT Movie、ChinaDrama、KoreaDrama、EAseries、TaiwanDrama Gossiping、Womentalk 之資料,透過文字分析平台檢索「Netflix」、「網飛」兩個關鍵字,共搜尋到49855篇文章。

m <- read_csv('./data/振興券_artWordFreq.csv')
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   word = col_character(),
##   count = col_double()
## )
mr <- read_csv('./data/振興券_articleReviews.csv')
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   cmtPoster = col_character(),
##   cmtStatus = col_character(),
##   cmtDate = col_datetime(format = ""),
##   cmtContent = col_character()
## )
mr$artDate = mr$artDate %>% as.Date("%Y/%m/%d")

預覽資料

head(mr)
## # A tibble: 6 x 10
##   artTitle artDate    artTime  artUrl artPoster artCat cmtPoster cmtStatus
##   <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>     <chr>    
## 1 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ wensandra 推       
## 2 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
## 3 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ jffry6663 →        
## 4 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
## 5 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ wensandra →        
## 6 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ ah937609  →        
## # ... with 2 more variables: cmtDate <dttm>, cmtContent <chr>

斷詞、停用詞使用

jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
n_tokens <- mr %>% unnest_tokens(word, cmtContent, token=tokenizer)
str(n_tokens)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 47464 obs. of  10 variables:
##  $ artTitle : chr  "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" ...
##  $ artDate  : Date, format: "2017-01-17" "2017-01-17" ...
##  $ artTime  : 'hms' num  17:37:27 17:37:27 17:37:27 17:37:27 ...
##   ..- attr(*, "units")= chr "secs"
##  $ artUrl   : chr  "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" ...
##  $ artPoster: chr  "kmjhome" "kmjhome" "kmjhome" "kmjhome" ...
##  $ artCat   : chr  "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
##  $ cmtPoster: chr  "wensandra" "wensandra" "wensandra" "js52666" ...
##  $ cmtStatus: chr  "推" "推" "推" "→" ...
##  $ cmtDate  : POSIXct, format: "2017-01-18 01:37:00" "2017-01-18 01:37:00" ...
##  $ word     : chr  "之前" "一次" "剛剛" "一樓" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   artTitle = col_character(),
##   ..   artDate = col_date(format = ""),
##   ..   artTime = col_time(format = ""),
##   ..   artUrl = col_character(),
##   ..   artPoster = col_character(),
##   ..   artCat = col_character(),
##   ..   cmtPoster = col_character(),
##   ..   cmtStatus = col_character(),
##   ..   cmtDate = col_datetime(format = ""),
##   ..   cmtContent = col_character()
##   .. )
head(n_tokens, 20)
## # A tibble: 20 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat cmtPoster cmtStatus
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>     <chr>    
##  1 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ wensandra 推       
##  2 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ wensandra 推       
##  3 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ wensandra 推       
##  4 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
##  5 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ jffry6663 →        
##  6 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
##  7 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
##  8 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ js52666   →        
##  9 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ ah937609  →        
## 10 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ Dooo      →        
## 11 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ Dooo      →        
## 12 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ Dooo      →        
## 13 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ nestea91~ 推       
## 14 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ nestea91~ 推       
## 15 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ nestea91~ 推       
## 16 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ GTA0328   推       
## 17 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ GTA0328   推       
## 18 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ GTA0328   推       
## 19 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ eric999   推       
## 20 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome   Gossi~ eric999   推       
## # ... with 2 more variables: cmtDate <dttm>, word <chr>

計算所有字在文集中的總詞頻

word_count <- n_tokens %>%
  #select(word) %>% 
  group_by(word) %>% 
  summarise(count = n())  %>%
  filter(count>3) %>%  # 過濾出現太少次的字
  arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
head(word_count, 100)
## # A tibble: 100 x 2
##    word   count
##    <chr>  <int>
##  1 消費券   945
##  2 消費     412
##  3 政府     402
##  4 真的     376
##  5 台灣     345
##  6 消費卷   323
##  7 政策     261
##  8 馬英九   256
##  9 經濟     245
## 10 垃圾     243
## # ... with 90 more rows

討論度較高、較常出現的影集名稱為「毒梟」、「黑鏡」、「怪奇物語」、「罪夢者」等

文字雲

word_count %>% 
   filter(word !="消費券") %>%
   filter(word !="消費卷") %>%
   filter(count>200) %>%
   wordcloud2()

可大致上看出Netflix在ptt版上常和「電影」、「影集」、「韓劇」等關鍵字一同出現

長條圖

plot_merge <- word_count %>% 
  filter(word !="消費券") %>%
   filter(word !="消費卷") %>%
  #group_by(type) %>% 
  top_n(30, count) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(x=word, y=count)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y="詞頻") +
  theme(text=element_text(size=14, family = "Heiti TC Light"))+
  #facet_wrap(~type, ncol = 1, scales="free") + 
  coord_flip()

plot_merge
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

計算詞頻

# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count_by_date <- n_tokens %>%
  #filter(nchar(.$word)>1) %>%
  group_by(word, artDate) %>%
  #group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum>3) %>%
  arrange(desc(sum))
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
head(tokens_count_by_date)
## # A tibble: 6 x 3
## # Groups:   word [4]
##   word   artDate      sum
##   <chr>  <date>     <int>
## 1 消費券 2020-05-28   130
## 2 真的   2020-05-28   116
## 3 馬英九 2020-05-28    96
## 4 一家親 2020-04-17    84
## 5 消費券 2020-04-08    77
## 6 消費券 2020-04-04    68
# tokens_count_by_date <- n_tokens %>% 
#   group_by(artDate) %>% 
#   summarise(count = n())
# tokens_count_by_date

情緒分析

準備LIWC字典

以LIWC字典判斷文集中的word屬於正面字還是負面字

# 正向字典txt檔
# 以,將字分隔
P <- read_file("dict/liwc/positive.txt")

# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(N)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

與LIWC情緒字典做join

文集中的字出現在LIWC字典中是屬於positive還是negative

tokens_count_by_date %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 150 x 4
## # Groups:   word [55]
##    word  artDate      sum sentiment
##    <chr> <date>     <int> <fct>    
##  1 幫助  2020-05-29    37 positive 
##  2 可憐  2020-05-28    33 negative 
##  3 失敗  2020-05-28    25 negative 
##  4 垃圾  2020-05-28    25 negative 
##  5 垃圾  2017-07-25    23 negative 
##  6 垃圾  2020-05-29    18 negative 
##  7 垃圾  2020-04-04    16 negative 
##  8 問題  2020-05-29    16 negative 
##  9 智障  2020-05-28    16 negative 
## 10 簡單  2020-05-28    15 positive 
## # ... with 140 more rows
# n_tokens %>% 
#    select(word) %>%
#    inner_join(LIWC)

# n_tokens_liwc <- n_tokens %>%
#   select(word) %>%
#   inner_join(LIWC)
#   #merge(x = n_tokens, y = LIWC, by = "word", all.y  = TRUE)
# n_tokens_liwc <- n_tokens_liwc %>%
#   left_join(n_tokens)

# n_tokens %>% 
#   select(word) %>%
#   inner_join(LIWC)

資料轉換為DTM

m_dtm <- m %>% cast_dtm(artUrl, word, count)
m_dtm
## <<DocumentTermMatrix (documents: 240, terms: 6056)>>
## Non-/sparse entries: 15432/1438008
## Sparsity           : 99%
## Maximal term length: 9
## Weighting          : term frequency (tf)
inspect(m_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 34/66
## Sparsity           : 66%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       八卦 大家 今天 年前
##   https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html    1    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html    0    0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html    0    1    0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html    0    1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html    0    0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html    0    1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html    0    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html    1    1    0    0
##                                                           Terms
## Docs                                                       有沒有 怎麼 剛剛
##   https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html      1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html      0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html      0    1    0
##   https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html      1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html      1    0    0
##                                                           Terms
## Docs                                                       消費 發現 當年
##   https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html    1    0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html    7    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html    7    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html    2    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html    3    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html    6    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html    1    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html    2    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html    2    0    1

建立LDA模型

lda <- LDA(m_dtm, k = 2, control = list(seed = 2020))

ϕ Matrix

查看ϕ matrix (topic * term)

topics <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
topics
## # A tibble: 12,112 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 剛剛  8.43e- 4
##  2     2 剛剛  1.47e-16
##  3     1 發現  1.35e- 3
##  4     2 發現  1.66e- 3
##  5     1 年前  8.43e- 4
##  6     2 年前  2.86e-10
##  7     1 今天  1.75e- 3
##  8     2 今天  5.97e- 4
##  9     1 消費  4.59e- 2
## 10     2 消費  5.30e- 2
## # ... with 12,102 more rows

Topic代表字

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

更多主題

ldas = c()
topics = c(2,3,10,25,36)
for(topic in topics){
 start_time <- Sys.time()
 lda <- LDA(m_dtm, k = topic, control = list(seed = 2020))
 ldas =c(ldas,lda)
 print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
 save(ldas,file = "ldas_result")
}
## [1] "2 topic(s) and use time is  1.57168412208557"
## [1] "3 topic(s) and use time is  3.23996615409851"
## [1] "10 topic(s) and use time is  13.5919880867004"
## [1] "25 topic(s) and use time is  51.4556488990784"
## [1] "36 topic(s) and use time is  1.45670191844304"

每個主題的結果

load("ldas_result")
ldas[[3]] 
## A LDA_VEM topic model with 10 topics.

透過perplexity找到最佳主題數

# topics = c(2,3,10,25,36)
# data_frame(k = topics,
#           perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
 # ggplot(aes(k, perplex)) +
  #geom_point() +
#  geom_line() +
 # labs(title = "Evaluating LDA topic models",
  #     subtitle = "Optimal number of topics (smaller is better)",
   #    x = "Number of topics",
    #   y = "Perplexity")

建立LDA模型-3 topics

lda <- LDA(m_dtm, k = 3, control = list(seed = 2020))

ϕ Matrix

查看ϕ matrix (topic * term)

topics <- tidy(lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics
## # A tibble: 18,168 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 剛剛  1.16e-  3
##  2     2 剛剛  1.20e- 27
##  3     3 剛剛  1.89e- 25
##  4     1 發現  1.20e-  3
##  5     2 發現  2.30e-  3
##  6     3 發現  9.60e-  4
##  7     1 年前  9.83e-  4
##  8     2 年前  1.92e-  4
##  9     3 年前  8.11e-141
## 10     1 今天  1.71e-  3
## # ... with 18,158 more rows

Topic代表字

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

感覺三個主題比較好,還多一個花蓮

產生create LDAvis所需的json function

  • 此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。
topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
   ###以下function 用來解決,主題數多會出現NA的問題
  ## 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi)
{
  jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
    lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
    rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
    0.5 * sum(lhs) + 0.5 * sum(rhs)
  }
  dist.mat <- proxy::dist(x = phi, method = jensenShannon)
  pca.fit <- stats::cmdscale(dist.mat, k = 2)
  data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(doc_term)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            doc.length = as.vector(table(doc_term$i)),
                            term.frequency = term_freq, mds.method = ls_LDA)

    return(json_lda)
}

LDA後續分析

  • 根據前面的探索之後,我們對於資料有更加了解,並且看完每個主題數的LDAvis之後,選定主題數3的結果來作後續的分析。
m_lda = ldas[[2]] ## 選定topic 為3 的結果

topics <- tidy(m_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics
## # A tibble: 18,168 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 剛剛  1.16e-  3
##  2     2 剛剛  1.20e- 27
##  3     3 剛剛  1.89e- 25
##  4     1 發現  1.20e-  3
##  5     2 發現  2.30e-  3
##  6     3 發現  9.60e-  4
##  7     1 年前  9.83e-  4
##  8     2 年前  1.92e-  4
##  9     3 年前  8.11e-141
## 10     1 今天  1.71e-  3
## # ... with 18,158 more rows

尋找Topic的代表字

  • 整理出每一個Topic中生成概率最高的10個詞彙。
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) + 
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

  • 移除常出現、跨主題共享的詞彙。
remove_word = c("消費","發放","政府","經濟")
top_terms <- topics %>%
  filter(!term  %in% remove_word)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

主題命名

topic_name = c("古今比較","疫情相關","旅遊相關")

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(m_lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics
## [1] 240   3

每篇文章都有topic的分佈,所以240筆的文章*3個主題

cbind Document 主題分佈

# get document topic proportions 
document_topics <- doc_pro[m$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
m_topic = cbind(m,document_topics_df)
# m_topic %>% head(10)

查看特定主題的文章

m_topic %>%
    arrange(desc(`古今比較`)) %>%head(10) 
##                                        artTitle    artDate  artTime
## 1  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 2  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 3  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 4  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 5  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 6  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 7  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 8  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 9  [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 10 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
##                                                      artUrl   word count
## 1  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   電腦    11
## 2  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   競賽     6
## 3  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 蔡慧玲     5
## 4  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   全國     5
## 5  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   技能     5
## 6  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   證照     4
## 7  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   應用     4
## 8  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   希望     4
## 9  https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   未來     4
## 10 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html   完整     3
##     古今比較     疫情相關     旅遊相關
## 1  0.9997869 0.0001065682 0.0001065682
## 2  0.9997869 0.0001065682 0.0001065682
## 3  0.9997869 0.0001065682 0.0001065682
## 4  0.9997869 0.0001065682 0.0001065682
## 5  0.9997869 0.0001065682 0.0001065682
## 6  0.9997869 0.0001065682 0.0001065682
## 7  0.9997869 0.0001065682 0.0001065682
## 8  0.9997869 0.0001065682 0.0001065682
## 9  0.9997869 0.0001065682 0.0001065682
## 10 0.9997869 0.0001065682 0.0001065682

了解主題在時間的變化

# m_topic[,c(7:16)] =sapply(m_topic[,c(7:16)] , as.numeric)
m_topic %>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  dplyr::select(-count)%>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
 ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.).
## In the next version, this warning will become an error.

- 取資料量多的月份

m_topic %>%
  filter( format(artDate,'%Y%m') %in% c(201802,202004,202005))%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
 ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.).
## In the next version, this warning will become an error.

  • 以比例了解主題時間變化
m_topic %>%
  filter( format(artDate,'%Y%m') %in% c(201802,202004,202005))%>%
  dplyr::select(-count)%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
 ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.).
## In the next version, this warning will become an error.