Motivation and analysis goals:
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): the OS
## reports that the request to set the locale to "zh_TW.UTF-8" could not be honored
## [1] ""
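The warning above means the operating system (a Windows machine here) rejected the POSIX-style locale name. A minimal workaround sketch, assuming a Traditional Chinese Windows installation (the Windows-style locale string below is the usual name, not verified on this machine):
if (.Platform$OS.type == "windows") {
  # Windows uses its own locale naming; code page 950 is Traditional Chinese (Big5)
  Sys.setlocale(category = "LC_ALL", locale = "Chinese (Traditional)_Taiwan.950")
}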
# install.packages(c("curl", "httr"))
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "Rtsne", "randomcoloR", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets", "servr") # ggplot2 added: it is require()d below but was missing from this list
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(widyr)
require(readr)
require(reshape2)
require(NLP)
require(ggraph)
require(igraph)
require(tm)
require(data.table)
require(quanteda)
require(Matrix)
require(slam)
require(Rtsne)
require(randomcoloR)
require(wordcloud)
require(topicmodels)
require(LDAvis)
require(webshot)
require(htmlwidgets)
require(servr)
Data loading: the corpus consists of PTT articles (mainly from the Gossiping board) that mention the consumer/stimulus vouchers (「消費券」/「振興券」), retrieved through a text-analysis platform. The two files below hold the per-article word frequencies and the article comments; the dates span roughly January 2017 to May 2020, with 240 articles in total (per the document-term matrix built later).
m <- read_csv('./data/振興券_artWordFreq.csv')
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## word = col_character(),
## count = col_double()
## )
mr <- read_csv('./data/振興券_articleReviews.csv')
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## cmtPoster = col_character(),
## cmtStatus = col_character(),
## cmtDate = col_datetime(format = ""),
## cmtContent = col_character()
## )
mr$artDate = mr$artDate %>% as.Date("%Y/%m/%d") # artDate was already parsed as a Date by read_csv, so this is a no-op kept for safety
head(mr)
## # A tibble: 6 x 10
## artTitle artDate artTime artUrl artPoster artCat cmtPoster cmtStatus
## <chr> <date> <time> <chr> <chr> <chr> <chr> <chr>
## 1 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ wensandra 推
## 2 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 3 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ jffry6663 →
## 4 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 5 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ wensandra →
## 6 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ ah937609 →
## # ... with 2 more variables: cmtDate <dttm>, cmtContent <chr>
# jieba worker with a custom user dictionary and stop-word list
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
# Tokenizer for unnest_tokens(): segment each text with jieba and keep only
# tokens longer than one character
tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
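A quick sanity check that the tokenizer behaves as intended (the sentence below is illustrative, not from the corpus):
# Segment a sample sentence; single-character tokens are dropped
tokenizer("政府發放振興券刺激消費")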
n_tokens <- mr %>% unnest_tokens(word, cmtContent, token=tokenizer)
str(n_tokens)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 47464 obs. of 10 variables:
## $ artTitle : chr "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" "[問卦]有沒有當年消費券怎麼用的八卦" ...
## $ artDate : Date, format: "2017-01-17" "2017-01-17" ...
## $ artTime : 'hms' num 17:37:27 17:37:27 17:37:27 17:37:27 ...
## ..- attr(*, "units")= chr "secs"
## $ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" "https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html" ...
## $ artPoster: chr "kmjhome" "kmjhome" "kmjhome" "kmjhome" ...
## $ artCat : chr "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
## $ cmtPoster: chr "wensandra" "wensandra" "wensandra" "js52666" ...
## $ cmtStatus: chr "推" "推" "推" "→" ...
## $ cmtDate : POSIXct, format: "2017-01-18 01:37:00" "2017-01-18 01:37:00" ...
## $ word : chr "之前" "一次" "剛剛" "一樓" ...
## - attr(*, "spec")=
## .. cols(
## .. artTitle = col_character(),
## .. artDate = col_date(format = ""),
## .. artTime = col_time(format = ""),
## .. artUrl = col_character(),
## .. artPoster = col_character(),
## .. artCat = col_character(),
## .. cmtPoster = col_character(),
## .. cmtStatus = col_character(),
## .. cmtDate = col_datetime(format = ""),
## .. cmtContent = col_character()
## .. )
head(n_tokens, 20)
## # A tibble: 20 x 10
## artTitle artDate artTime artUrl artPoster artCat cmtPoster cmtStatus
## <chr> <date> <time> <chr> <chr> <chr> <chr> <chr>
## 1 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ wensandra 推
## 2 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ wensandra 推
## 3 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ wensandra 推
## 4 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 5 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ jffry6663 →
## 6 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 7 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 8 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ js52666 →
## 9 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ ah937609 →
## 10 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ Dooo →
## 11 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ Dooo →
## 12 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ Dooo →
## 13 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ nestea91~ 推
## 14 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ nestea91~ 推
## 15 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ nestea91~ 推
## 16 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ GTA0328 推
## 17 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ GTA0328 推
## 18 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ GTA0328 推
## 19 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ eric999 推
## 20 [問卦]有沒有~ 2017-01-17 17:37:27 https~ kmjhome Gossi~ eric999 推
## # ... with 2 more variables: cmtDate <dttm>, word <chr>
word_count <- n_tokens %>%
#select(word) %>%
group_by(word) %>%
summarise(count = n()) %>%
  filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
head(word_count, 100)
## # A tibble: 100 x 2
## word count
## <chr> <int>
## 1 消費券 945
## 2 消費 412
## 3 政府 402
## 4 真的 376
## 5 台灣 345
## 6 消費卷 323
## 7 政策 261
## 8 馬英九 256
## 9 經濟 245
## 10 垃圾 243
## # ... with 90 more rows
After filtering out the keyword variants 「消費券」/「消費卷」 themselves, the most frequent terms include 「消費」, 「政府」, 「台灣」, 「政策」, 「馬英九」, 「經濟」, and 「垃圾」.
word_count %>%
filter(word !="消費券") %>%
filter(word !="消費卷") %>%
filter(count>200) %>%
wordcloud2()
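wordcloud2() returns an htmlwidget, so it renders only in HTML output. Since htmlwidgets and webshot are already loaded, a hedged sketch for exporting a static image (file names are illustrative; webshot::install_phantomjs() may be needed once beforehand):
wc <- word_count %>%
  filter(!word %in% c("消費券", "消費卷"), count > 200) %>%
  wordcloud2()
# Save the widget to HTML, then snapshot it to PNG
htmlwidgets::saveWidget(wc, "wordcloud.html", selfcontained = FALSE)
webshot::webshot("wordcloud.html", "wordcloud.png", delay = 5)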
The bar chart of the top 30 terms shows that voucher discussion on PTT revolves around the government (「政府」), policy (「政策」), and the economy (「經濟」), with 馬英九, whose administration issued the 2009 consumer vouchers, appearing prominently.
plot_merge <- word_count %>%
filter(word !="消費券") %>%
filter(word !="消費卷") %>%
#group_by(type) %>%
top_n(30, count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(x=word, y=count)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
  theme(text=element_text(size=14, family = "Heiti TC Light"))+ # macOS font; on Windows this triggers the font warnings below
#facet_wrap(~type, ncol = 1, scales="free") +
coord_flip()
plot_merge
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
# Count each word's occurrences by date; single-character words are excluded (already dropped by the tokenizer)
tokens_count_by_date <- n_tokens %>%
#filter(nchar(.$word)>1) %>%
group_by(word, artDate) %>%
#group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>3) %>%
arrange(desc(sum))
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
head(tokens_count_by_date)
## # A tibble: 6 x 3
## # Groups: word [4]
## word artDate sum
## <chr> <date> <int>
## 1 消費券 2020-05-28 130
## 2 真的 2020-05-28 116
## 3 馬英九 2020-05-28 96
## 4 一家親 2020-04-17 84
## 5 消費券 2020-04-08 77
## 6 消費券 2020-04-04 68
# tokens_count_by_date <- n_tokens %>%
# group_by(artDate) %>%
# summarise(count = n())
# tokens_count_by_date
Use the LIWC dictionary to judge whether each word in the corpus is positive or negative.
# Positive-word dictionary (txt file, words separated by commas)
P <- read_file("dict/liwc/positive.txt")
# Negative-word dictionary (txt file)
N <- read_file("dict/liwc/negative.txt")
# Each dictionary file is read in as a single string
typeof(N)
## [1] "character"
# Split each string on commas; strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build a dataframe with two columns, word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
Which words in the corpus appear in the LIWC dictionary, and are they positive or negative:
tokens_count_by_date %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 150 x 4
## # Groups: word [55]
## word artDate sum sentiment
## <chr> <date> <int> <fct>
## 1 幫助 2020-05-29 37 positive
## 2 可憐 2020-05-28 33 negative
## 3 失敗 2020-05-28 25 negative
## 4 垃圾 2020-05-28 25 negative
## 5 垃圾 2017-07-25 23 negative
## 6 垃圾 2020-05-29 18 negative
## 7 垃圾 2020-04-04 16 negative
## 8 問題 2020-05-29 16 negative
## 9 智障 2020-05-28 16 negative
## 10 簡單 2020-05-28 15 positive
## # ... with 140 more rows
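Building on the join above, a minimal sketch that aggregates the LIWC matches into a daily net-sentiment score (positive minus negative mentions); the object name is illustrative:
sentiment_by_date <- tokens_count_by_date %>%
  inner_join(LIWC, by = "word") %>%
  group_by(artDate, sentiment) %>%
  summarise(n = sum(sum), .groups = "drop") %>%
  spread(sentiment, n, fill = 0) %>% # one column per sentiment
  mutate(net = positive - negative)  # net sentiment per day
head(sentiment_by_date)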
# n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# n_tokens_liwc <- n_tokens %>%
# select(word) %>%
# inner_join(LIWC)
# #merge(x = n_tokens, y = LIWC, by = "word", all.y = TRUE)
# n_tokens_liwc <- n_tokens_liwc %>%
# left_join(n_tokens)
m_dtm <- m %>% cast_dtm(artUrl, word, count)
m_dtm
## <<DocumentTermMatrix (documents: 240, terms: 6056)>>
## Non-/sparse entries: 15432/1438008
## Sparsity : 99%
## Maximal term length: 9
## Weighting : term frequency (tf)
inspect(m_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 34/66
## Sparsity : 66%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 八卦 大家 今天 年前
## https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html 1 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html 0 0 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html 0 1 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html 0 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html 1 1 0 0
## Terms
## Docs 有沒有 怎麼 剛剛
## https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html 1 0 0
## Terms
## Docs 消費 發現 當年
## https://www.ptt.cc/bbs/Gossiping/M.1484703813.A.B6F.html 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1484704029.A.D83.html 1 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1484796648.A.AC8.html 7 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484809083.A.979.html 7 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484824813.A.E19.html 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484825377.A.B6D.html 3 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1484920512.A.E03.html 6 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1485424627.A.05E.html 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1489399885.A.E0F.html 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1490027352.A.ED0.html 2 0 1
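The matrix above is 99% sparse. LDA runs fine on it here, but if fitting were slow or noisy one could optionally trim very rare terms first; the 0.99 threshold below is an illustrative assumption, not a tuned value:
# Optional: drop terms absent from more than 99% of documents
m_dtm_trimmed <- tm::removeSparseTerms(m_dtm, 0.99)
m_dtm_trimmed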
lda <- LDA(m_dtm, k = 2, control = list(seed = 2020))
topics <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi (topic-word) matrix
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
topics
## # A tibble: 12,112 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 剛剛 8.43e- 4
## 2 2 剛剛 1.47e-16
## 3 1 發現 1.35e- 3
## 4 2 發現 1.66e- 3
## 5 1 年前 8.43e- 4
## 6 2 年前 2.86e-10
## 7 1 今天 1.75e- 3
## 8 2 今天 5.97e- 4
## 9 1 消費 4.59e- 2
## 10 2 消費 5.30e- 2
## # ... with 12,102 more rows
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
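Besides the topic-word matrix (beta), tidy() can also return the per-document topic proportions (gamma), which shows how strongly each article loads on each topic:
# Document-topic proportions; gamma sums to 1 within each document
doc_topics <- tidy(lda, matrix = "gamma")
head(doc_topics)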
ldas = c()
topics = c(2,3,10,25,36)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(m_dtm, k = topic, control = list(seed = 2020))
  ldas = c(ldas, lda)
  # Caution: Sys.time() - start_time is a difftime whose units auto-switch,
  # so the 36-topic line below reports ~1.46 minutes, not seconds
  print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
  save(ldas, file = "ldas_result")
}
## [1] "2 topic(s) and use time is 1.57168412208557"
## [1] "3 topic(s) and use time is 3.23996615409851"
## [1] "10 topic(s) and use time is 13.5919880867004"
## [1] "25 topic(s) and use time is 51.4556488990784"
## [1] "36 topic(s) and use time is 1.45670191844304"
load("ldas_result")
ldas[[3]]
## A LDA_VEM topic model with 10 topics.
# Commented out: perplexity comparison across the fitted models
# (needs purrr for map_dbl; data_frame() is deprecated, so data.frame is used)
# topics = c(2,3,10,25,36)
# data.frame(k = topics,
#            perplex = purrr::map_dbl(ldas, topicmodels::perplexity)) %>%
#   ggplot(aes(k, perplex)) +
#   geom_point() +
#   geom_line() +
#   labs(title = "Evaluating LDA topic models",
#        subtitle = "Optimal number of topics (smaller is better)",
#        x = "Number of topics",
#        y = "Perplexity")
lda <- LDA(m_dtm, k = 3, control = list(seed = 2020))
topics <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi (topic-word) matrix
topics
## # A tibble: 18,168 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 剛剛 1.16e- 3
## 2 2 剛剛 1.20e- 27
## 3 3 剛剛 1.89e- 25
## 4 1 發現 1.20e- 3
## 5 2 發現 2.30e- 3
## 6 3 發現 9.60e- 4
## 7 1 年前 9.83e- 4
## 8 2 年前 1.92e- 4
## 9 3 年前 8.11e-141
## 10 1 今天 1.71e- 3
## # ... with 18,158 more rows
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
Three topics seem like the better fit; the extra topic that emerges is about Hualien (花蓮).
topicmodels_json_ldavis <- function(fitted, doc_term){
  require(LDAvis)
  require(slam)
  ### The helper below works around NAs that LDAvis produces when the number
  ### of topics is large; see
  ### https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
  ls_LDA = function (phi)
  {
    jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
      lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
      rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
      0.5 * sum(lhs) + 0.5 * sum(rhs)
    }
    dist.mat <- proxy::dist(x = phi, method = jensenShannon)
    pca.fit <- stats::cmdscale(dist.mat, k = 2)
    data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
  }
  # Find required quantities
  phi <- as.matrix(posterior(fitted)$terms)
  theta <- as.matrix(posterior(fitted)$topics)
  vocab <- colnames(phi)
  term_freq <- slam::col_sums(doc_term)
  # Convert to json; doc.length must be the token count per document
  # (row sums), not the number of non-zero entries per document
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 doc.length = slam::row_sums(doc_term),
                                 term.frequency = term_freq, mds.method = ls_LDA)
  return(json_lda)
}
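The function above is only defined, never rendered in this report; a hedged usage sketch (the output directory name is illustrative):
# Render the 3-topic model with LDAvis; serVis writes an HTML/JS bundle
json_res <- topicmodels_json_ldavis(ldas[[2]], m_dtm)
LDAvis::serVis(json_res, out.dir = "ldavis_output", open.browser = FALSE)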
m_lda = ldas[[2]] ## select the k = 3 model fitted above
topics <- tidy(m_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi (topic-word) matrix
topics
## # A tibble: 18,168 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 剛剛 1.16e- 3
## 2 2 剛剛 1.20e- 27
## 3 3 剛剛 1.89e- 25
## 4 1 發現 1.20e- 3
## 5 2 發現 2.30e- 3
## 6 3 發現 9.60e- 4
## 7 1 年前 9.83e- 4
## 8 2 年前 1.92e- 4
## 9 3 年前 8.11e-141
## 10 1 今天 1.71e- 3
## # ... with 18,158 more rows
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
remove_word = c("消費","發放","政府","經濟") # drop generic high-frequency words that dominate every topic
top_terms <- topics %>%
filter(!term %in% remove_word)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
topic_name = c("古今比較","疫情相關","旅遊相關") # then-vs-now comparison / pandemic-related / travel-related
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(m_lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
## [1] 240 3
Each article has a distribution over the topics, hence 240 articles × 3 topics.
# get document topic proportions
document_topics <- doc_pro[m$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
m_topic = cbind(m,document_topics_df)
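With the proportions attached, a small sketch that labels each article with its single dominant topic (object names are illustrative):
# Assign each document the topic with the highest posterior probability
dominant <- data.frame(artUrl = rownames(doc_pro),
                       main_topic = topic_name[max.col(doc_pro)])
head(dominant)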
# m_topic %>% head(10)
m_topic %>%
arrange(desc(`古今比較`)) %>%head(10)
## artTitle artDate artTime
## 1 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 2 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 3 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 4 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 5 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 6 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 7 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 8 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 9 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## 10 [新聞]消費券買電腦奠基礎 高中少女擁30UP證照 2018-01-06 16:43:19
## artUrl word count
## 1 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 電腦 11
## 2 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 競賽 6
## 3 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 蔡慧玲 5
## 4 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 全國 5
## 5 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 技能 5
## 6 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 證照 4
## 7 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 應用 4
## 8 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 希望 4
## 9 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 未來 4
## 10 https://www.ptt.cc/bbs/Gossiping/M.1515286161.A.057.html 完整 3
## 古今比較 疫情相關 旅遊相關
## 1 0.9997869 0.0001065682 0.0001065682
## 2 0.9997869 0.0001065682 0.0001065682
## 3 0.9997869 0.0001065682 0.0001065682
## 4 0.9997869 0.0001065682 0.0001065682
## 5 0.9997869 0.0001065682 0.0001065682
## 6 0.9997869 0.0001065682 0.0001065682
## 7 0.9997869 0.0001065682 0.0001065682
## 8 0.9997869 0.0001065682 0.0001065682
## 9 0.9997869 0.0001065682 0.0001065682
## 10 0.9997869 0.0001065682 0.0001065682
# m_topic[,c(7:16)] =sapply(m_topic[,c(7:16)] , as.numeric)
m_topic %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
dplyr::select(-count)%>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.).
## In the next version, this warning will become an error.
- Keep only the months with the most data
m_topic %>%
  filter( format(artDate,'%Y%m') %in% c("201802","202004","202005"))%>%
  dplyr::select(-count)%>% # added for consistency: without it the raw count column is summed and plotted as a topic
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
m_topic %>%
  filter( format(artDate,'%Y%m') %in% c("201802","202004","202005"))%>%
dplyr::select(-count)%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
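The melt deprecation warning shown earlier comes from data.table intercepting reshape2::melt on a tibble. A sketch of the modern equivalent with tidyr::pivot_longer (tidyr is already loaded; assumes tidyr >= 1.0):
m_topic %>%
  filter(format(artDate,'%Y%m') %in% c("201802","202004","202005")) %>%
  dplyr::select(-count) %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  pivot_longer(-artDate, names_to = "variable", values_to = "value") %>% # replaces melt()
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x=artDate, y=value/total_value, fill=variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))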