###LDA分析
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
tsai_data <- fread("蔡英文_articleMetaData.csv", encoding = "UTF-8")
hen_data <- fread("韓國瑜_articleMetaData.csv", encoding = "UTF-8")
tsai_data$artDate <- tsai_data$artDate %>% as.Date("%Y/%m/%d")
hen_data$artDate <- hen_data$artDate %>% as.Date("%Y/%m/%d")
tsai_data_ori = tsai_data
hen_data_ori = hen_data
total = rbind(tsai_data,hen_data)
total$artDate = as.Date(total$artDate)
total %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()
>2020年1月11日的貼文數與留言數最多
#jieba_tokenizer <- worker()
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
#new_user_word(jieba_tokenizer, user_dict)
data_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
res <- filter_segment(tokens, stop_words)
return(res)
})
}
tsai_data <- tsai_data %>%
unnest_tokens(word, sentence, data_tokenizer)
hen_data <- hen_data %>%
unnest_tokens(word, sentence, data_tokenizer)
tsai_data <- tsai_data %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
group_by(artUrl,word) %>%
summarise(
count = n()
)%>%
filter(nchar(word)>1)
hen_data <- hen_data %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
group_by(artUrl,word) %>%
summarise(
count = n()
)%>%
filter(nchar(word)>1)
tsai_dtm <- tsai_data %>% cast_dtm(artUrl, word, count)
inspect(tsai_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一再 一致 一席
## https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html 0 0 0
## Terms
## Docs 一條 力量 大選
## https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html 1 3 1
## https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html 0 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html 0 0 2
## https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html 0 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html 0 0 0
## Terms
## Docs 不利 不夠 之前
## https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html 1 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html 0 0 1
## Terms
## Docs 之間
## https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html 1
## https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html 0
## https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html 0
hen_dtm <- hen_data %>% cast_dtm(artUrl, word, count)
inspect(hen_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity : 86%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一中 一事 一定
## https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html 2 4 3
## https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html 0 0 1
## Terms
## Docs 人選 力量
## https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html 1 2
## https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html 0 0
## Terms
## Docs 十幾萬 上午
## https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html 0 0
## Terms
## Docs 上天 口號 不宜
## https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html 3 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html 0 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html 0 0 0
#以蔡英文及韓國瑜的資料分別整理出每一個Topic中生成概率最高的10個詞彙。
# ldas_t = c()
# topics = c(2,3,5,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda_t <- LDA(tsai_dtm, k = topic, control = list(seed = 2020))
# ldas_t =c(ldas_t,lda_t)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas_t,file = "ldas_result_t")
# }
# ldas_h = c()
# topics = c(2,3,5,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda_h <- LDA(hen_dtm, k = topic, control = list(seed = 2020))
# ldas_h =c(ldas_h,lda_h)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas_h,file = "ldas_result_h")
# }
load("ldas_result_t")
load("ldas_result_h")
new_lda_t = ldas_t[[5]] ## 選定topic 為10 的結果
topics_t <- tidy(new_lda_t, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
remove_words = c("蔡英文","韓國瑜","台灣","總統","記者","媒體","表示","我們","完整","沒有","來源","內文","連結","大家","我們","就是","新聞標題","只是","這樣","還是","可以","自己","不是","一定","網址","什麼","很多","備註","如果","所以","知道","現在","一個","覺得","怎麼","新聞", "今天","不會","這個","這種","一樣","因為","對此","報導","署名","這些","應該","只要","然後","不要","出來","但是","你們","蔡總統","根本","結果","問題","民進黨","其實","真的","可能","到底","他們","這麼","國民黨","柯文哲","一堆","一下","不能","是不是","還有","比較","候選人","參選人","只有","一直","已經","時候","看到", "新聞內文", "媒體來源", "完整新聞標題", "完整新聞", "小英", "中國", "支持", "市長")
top_terms_t <- topics_t %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_t %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
topic_name_t = c('經濟', '民調', '呂秀蓮、質疑', '兩岸、國家定位', '韓粉、造勢', '博士學位、論文', '美國', '反送中', '副總統', '時代力量');
tmResult_t <- posterior(new_lda_t)
doc_pro_t <- tmResult_t$topics
dim(doc_pro_t)
## [1] 4881 10
# get document topic proportions
tsai_data_ori <- tsai_data_ori %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics_t <- doc_pro_t[tsai_data_ori$artUrl,]
document_topics_t_df =data.frame(document_topics_t)
colnames(document_topics_t_df) = topic_name_t
rownames(document_topics_t_df) = NULL
news_topic_t = cbind(tsai_data_ori, document_topics_t_df)
news_topic_t %>% head(10)
## artTitle artDate artTime
## 1: [新聞]洪慈庸:明確支持蔡英文連任2020 2019-08-01 03:04:10
## 2: [新聞]柯文哲組黨震撼2020?周偉航:蔡英文穩了 2019-08-01 03:37:56
## 3: [新聞]華航贊助蔡英文攝影展?總統府攝影師: 2019-08-01 03:39:07
## 4: [新聞]2020總統大選》時代力量若決議挺蔡英文? 2019-08-01 04:04:25
## 5: [新聞]陸限縮自由行蔡英文批:戰略上很大的錯誤 2019-08-01 04:05:00
## 6: [新聞]快訊/柯文哲組黨 蔡英文籲團結護主權: 2019-08-01 04:14:32
## 7: [新聞]范雲表態:團結抗中保台支持蔡英文 2019-08-01 04:31:15
## 8: Re:[新聞]快訊/柯文哲組黨 蔡英文籲團結護主權: 2019-08-01 04:36:04
## 9: [新聞]時力風暴》新竹市議會黨團:肯定蔡英文 2019-08-01 04:38:14
## 10: Re:[新聞]洪慈庸:明確支持蔡英文連任2020 2019-08-01 04:45:22
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html armorblocks
## 2: https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html anz5566
## 3: https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html qqq5566
## 4: https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html ueewen
## 5: https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html LIN6627
## 6: https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html Moogle
## 7: https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html iasyt
## 8: https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html fifa186
## 9: https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html whokisswho
## 10: https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html attilalin
## artCat 經濟 民調 呂秀蓮、質疑 兩岸、國家定位
## 1: Gossiping 0.3122954972 0.0001472250 0.0001472250 0.0001472250
## 2: Gossiping 0.0002311591 0.9979195684 0.0002311591 0.0002311591
## 3: Gossiping 0.0003139361 0.0003139361 0.0003139361 0.0003139361
## 4: Gossiping 0.0002201757 0.4003635873 0.4535858036 0.0002201757
## 5: Gossiping 0.0001182441 0.0001182441 0.0001182441 0.0001182441
## 6: Gossiping 0.0001986063 0.0677766623 0.0001986063 0.0001986063
## 7: Gossiping 0.1458583508 0.0264941604 0.0001260559 0.0001260559
## 8: Gossiping 0.0002588264 0.0002588264 0.0002588264 0.2827041312
## 9: Gossiping 0.2637327357 0.0001579805 0.0001579805 0.0001579805
## 10: Gossiping 0.0002651741 0.0398247190 0.0002651741 0.0002651741
## 韓粉、造勢 博士學位、論文 美國 反送中 副總統
## 1: 0.0001472250 0.0001472250 0.5388867555 0.0001472250 0.0001472250
## 2: 0.0002311591 0.0002311591 0.0002311591 0.0002311591 0.0002311591
## 3: 0.0003139361 0.0003139361 0.0003139361 0.5937909092 0.4036976019
## 4: 0.0779124527 0.0002201757 0.0002201757 0.0002201757 0.0002201757
## 5: 0.0001182441 0.0001182441 0.3939173555 0.6051366914 0.0001182441
## 6: 0.0001986063 0.0001986063 0.6838814016 0.0001986063 0.2469516920
## 7: 0.0001260559 0.0001260559 0.2069399225 0.6199512310 0.0001260559
## 8: 0.0002588264 0.0002588264 0.1064141294 0.0002588264 0.0002588264
## 9: 0.0001579805 0.0001579805 0.6300887406 0.0001579805 0.0001579805
## 10: 0.0002651741 0.0002651741 0.4605384347 0.0002651741 0.0002651741
## 時代力量
## 1: 0.1477871724
## 2: 0.0002311591
## 3: 0.0003139361
## 4: 0.0668171020
## 5: 0.0001182441
## 6: 0.0001986063
## 7: 0.0001260559
## 8: 0.6090699549
## 9: 0.1050726601
## 10: 0.4977806278
news_topic_t %>%
arrange(desc(`經濟`))%>%head(10)
#news_topic_t[,c(7:16)] =sapply(news_topic_t[,c(7:16)] , as.numeric)
news_topic_t %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))
## 去除主題為None1, None2, None3
news_topic_t %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.
news_topic_t %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
new_lda_h = ldas_h[[5]] ## 選定topic 為10 的結果
topics_h <- tidy(new_lda_h, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_h
## # A tibble: 674,460 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 安排 0.00000000913
## 2 2 安排 0.00000405
## 3 3 安排 0.000157
## 4 4 安排 0.000637
## 5 5 安排 0.000135
## 6 6 安排 0.000973
## 7 7 安排 0.0000384
## 8 8 安排 0.000163
## 9 9 安排 0.00152
## 10 10 安排 0.0000217
## # ... with 674,450 more rows
remove_words_h = c("高雄", "高雄市", "市長","蔡英文","韓國瑜","台灣","總統","記者","媒體","表示","我們","完整","沒有","來源","內文","連結","大家","我們","就是","新聞標題","只是","這樣","還是","可以","自己","不是","一定","網址","什麼","很多","備註","如果","所以","知道","現在","一個","覺得","怎麼","新聞", "今天","不會","這個","這種","一樣","因為","對此","報導","署名","這些","應該","只要","然後","不要","出來","但是","你們","蔡總統","根本","結果","問題","民進黨","其實","真的","可能","到底","他們","這麼","國民黨","柯文哲","一堆","一下","不能","是不是","還有","比較","候選人","參選人","只有","一直","已經","時候","看到", "那麼", "直接", "對於", "現場", "中國", "美國", "支持", "新聞內文", "媒體來源", "完整新聞標題", "完整新聞", "小英", "中國", "支持", "市長")
top_terms_h <- topics_h %>%
filter(!term %in% remove_words_h)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_h %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(text = element_text(family='STHeitiTC-Light'))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
topic_name_h = c('民調', '黃國昌、砂石', '國家機器、豪宅', '日本學者、遲到', '兩岸', '造勢遊行', 'None', '愛情摩天輪', '新北、造勢', '庶民、草包')
tmResult_h <- posterior(new_lda_h)
doc_pro_h <- tmResult_h$topics
dim(doc_pro_h)
## [1] 6839 10
# get document topic proportions
hen_data_ori <- hen_data_ori %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics_h <- doc_pro_h[hen_data_ori$artUrl,]
document_topics_h_df =data.frame(document_topics_h)
colnames(document_topics_h_df) = topic_name_h
rownames(document_topics_h_df) = NULL
news_topic_h = cbind(hen_data_ori, document_topics_h_df)
news_topic_h %>% head(10)
## artTitle artDate artTime
## 1: [新聞]王金平拒當韓國瑜副手原因曝光! 「一定 2019-08-01 18:52:07
## 2: Re:[新聞]王金平拒當韓國瑜副手原因曝光! 「一定 2019-08-01 20:14:37
## 3: [新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 20:18:40
## 4: Re:[新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 20:20:52
## 5: [新聞]「鎂光燈閃眼睛很不舒服」 韓國瑜火氣大 2019-08-01 20:43:40
## 6: [新聞]愛情摩天輪8月要蓋..又沒了?韓國瑜:我 2019-08-01 21:19:25
## 7: [爆卦]中天、中時極力抹黑唱衰韓國瑜 2019-08-01 21:21:55
## 8: Re:[新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 21:42:54
## 9: [新聞]尷尬!韓國瑜表揚模範父親 匾額突墜地摔 2019-08-01 22:06:56
## 10: Re:[新聞]「鎂光燈閃眼睛很不舒服」 韓國瑜火氣大 2019-08-01 22:18:51
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html yenkin
## 2: https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html s910443tw
## 3: https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html derekgao
## 4: https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html jacklyl
## 5: https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html jiunyee
## 6: https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html xamous
## 7: https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html btm978952
## 8: https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html Bluce
## 9: https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html meiyouo
## 10: https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html shokotan
## artCat 民調 黃國昌、砂石 國家機器、豪宅 日本學者、遲到
## 1: Gossiping 0.0002055469 0.0002055469 0.0002055469 0.0002055469
## 2: Gossiping 0.0011753340 0.0011753340 0.0011753340 0.0011753340
## 3: Gossiping 0.0004219897 0.0004219897 0.0004219897 0.0004219897
## 4: Gossiping 0.0022523911 0.0022523911 0.0022523911 0.0022523911
## 5: Gossiping 0.0003114733 0.0003114733 0.0003114733 0.0003114733
## 6: Gossiping 0.0002229484 0.0002229484 0.1858169637 0.0002229484
## 7: Gossiping 0.0019035201 0.6219264607 0.0019035201 0.0019035201
## 8: Gossiping 0.0006908299 0.0006908299 0.0657089422 0.5411732386
## 9: Gossiping 0.0002284748 0.0002284748 0.0002284748 0.0002284748
## 10: Gossiping 0.0004704517 0.0004704517 0.1009497197 0.0423990497
## 兩岸 造勢遊行 None 愛情摩天輪 新北、造勢
## 1: 0.0002055469 0.0002055469 0.0002055469 0.0002055469 0.8632137425
## 2: 0.0011753340 0.0011753340 0.0011753340 0.0011753340 0.0011753340
## 3: 0.0004219897 0.3902069505 0.0004219897 0.6064171320 0.0004219897
## 4: 0.0022523911 0.7511070630 0.0022523911 0.2308738079 0.0022523911
## 5: 0.0003114733 0.9015488383 0.0003114733 0.0003114733 0.0959593754
## 6: 0.0002229484 0.2562265348 0.0002229484 0.5563958627 0.0002229484
## 7: 0.0019035201 0.0019035201 0.3628453784 0.0019035201 0.0019035201
## 8: 0.0006908299 0.1513105740 0.0006908299 0.0006908299 0.0006908299
## 9: 0.0002284748 0.4766214571 0.0002284748 0.0002284748 0.5215507441
## 10: 0.0004704517 0.0004704517 0.0004704517 0.0004704517 0.6081083117
## 庶民、草包
## 1: 0.1351418819
## 2: 0.9894219937
## 3: 0.0004219897
## 4: 0.0022523911
## 5: 0.0003114733
## 6: 0.0002229484
## 7: 0.0019035201
## 8: 0.2376622658
## 9: 0.0002284748
## 10: 0.2457202084
news_topic_h %>%
arrange(desc(`愛情摩天輪`))%>%head(10)
#news_topic_h[,c(7:16)] =sapply(news_topic_h[,c(7:16)] , as.numeric)
news_topic_h %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))
news_topic_h %>%
dplyr::select(-None)%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.
news_topic_h %>%
dplyr::select(-None)%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.