###LDA分析

require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

PTT八卦版2019.7.15~2020.1.10關於總統大選的文章

tsai_data <- fread("蔡英文_articleMetaData.csv", encoding = "UTF-8")
hen_data <- fread("韓國瑜_articleMetaData.csv", encoding = "UTF-8")

tsai_data$artDate <- tsai_data$artDate %>% as.Date("%Y/%m/%d")
hen_data$artDate <- hen_data$artDate %>% as.Date("%Y/%m/%d")

tsai_data_ori = tsai_data
hen_data_ori = hen_data
total = rbind(tsai_data,hen_data)
total$artDate = as.Date(total$artDate)
total %>% 
  group_by(artDate) %>%
  summarise(count = n())%>%
  ggplot(aes(artDate,count))+
    geom_line(color="red")+
  geom_point()

>2020年1月11日的貼文數與留言數最多

設定斷詞

#jieba_tokenizer <- worker()
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
#new_user_word(jieba_tokenizer, user_dict)
data_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    res <- filter_segment(tokens, stop_words)
    return(res)
  })
}

tsai_data <- tsai_data %>%
  unnest_tokens(word, sentence, data_tokenizer)
hen_data <- hen_data %>%
  unnest_tokens(word, sentence, data_tokenizer)

tsai_data <- tsai_data %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  group_by(artUrl,word) %>%
  summarise(
    count = n()
  )%>%
  filter(nchar(word)>1)

hen_data <- hen_data %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  group_by(artUrl,word) %>%
  summarise(
    count = n()
  )%>%
  filter(nchar(word)>1)

將資料轉換為Document Term Matrix (DTM)

tsai_dtm <- tsai_data %>% cast_dtm(artUrl, word, count)
inspect(tsai_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity           : 86%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       一再 一致 一席
##   https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html    0    0    0
##                                                           Terms
## Docs                                                       一條 力量 大選
##   https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html    1    3    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html    0    0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html    0    0    2
##   https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html    0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html    0    0    0
##                                                           Terms
## Docs                                                       不利 不夠 之前
##   https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html    1    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html    0    0    1
##                                                           Terms
## Docs                                                       之間
##   https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html    0
hen_dtm <- hen_data %>% cast_dtm(artUrl, word, count)
inspect(hen_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 14/86
## Sparsity           : 86%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##                                                           Terms
## Docs                                                       一中 一事 一定
##   https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html    2    4    3
##   https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html    0    0    1
##                                                           Terms
## Docs                                                       人選 力量
##   https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html    1    2
##   https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html    0    0
##                                                           Terms
## Docs                                                       十幾萬 上午
##   https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html      1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html      0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html      0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html      0    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html      0    0
##                                                           Terms
## Docs                                                       上天 口號 不宜
##   https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html    3    1    1
##   https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html    0    2    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html    0    0    0
##   https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html    0    0    0

#以蔡英文及韓國瑜的資料分別整理出每一個Topic中生成概率最高的10個詞彙。

# ldas_t = c()
# topics = c(2,3,5,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda_t <- LDA(tsai_dtm, k = topic, control = list(seed = 2020))
#   ldas_t =c(ldas_t,lda_t)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas_t,file = "ldas_result_t")
# }

# ldas_h = c()
# topics = c(2,3,5,6,10,15)
# for(topic in topics){
#   start_time <- Sys.time()
#   lda_h <- LDA(hen_dtm, k = topic, control = list(seed = 2020))
#   ldas_h =c(ldas_h,lda_h)
#   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
#   save(ldas_h,file = "ldas_result_h")
# }

載入每個主題的LDA結果

load("ldas_result_t")
load("ldas_result_h")

透過perplexity找到最佳主題數,選定topic 為10 的結果

蔡英文

new_lda_t = ldas_t[[5]] ## 選定topic 為10 的結果

topics_t <- tidy(new_lda_t, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。

去除掉蔡英文跨主題共享詞彙

remove_words = c("蔡英文","韓國瑜","台灣","總統","記者","媒體","表示","我們","完整","沒有","來源","內文","連結","大家","我們","就是","新聞標題","只是","這樣","還是","可以","自己","不是","一定","網址","什麼","很多","備註","如果","所以","知道","現在","一個","覺得","怎麼","新聞", "今天","不會","這個","這種","一樣","因為","對此","報導","署名","這些","應該","只要","然後","不要","出來","但是","你們","蔡總統","根本","結果","問題","民進黨","其實","真的","可能","到底","他們","這麼","國民黨","柯文哲","一堆","一下","不能","是不是","還有","比較","候選人","參選人","只有","一直","已經","時候","看到", "新聞內文", "媒體來源", "完整新聞標題", "完整新聞", "小英", "中國", "支持", "市長")
top_terms_t <- topics_t %>%
  filter(!term  %in% remove_words)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_t %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  coord_flip()+
  theme(text = element_text(family='STHeitiTC-Light'))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

蔡英文的主題

topic_name_t = c('經濟', '民調', '呂秀蓮、質疑', '兩岸、國家定位', '韓粉、造勢', '博士學位、論文', '美國', '反送中', '副總統', '時代力量');
tmResult_t <- posterior(new_lda_t)
doc_pro_t <- tmResult_t$topics 
dim(doc_pro_t)  
## [1] 4881   10
# get document topic proportions
tsai_data_ori <- tsai_data_ori %>%
  select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics_t <- doc_pro_t[tsai_data_ori$artUrl,]
document_topics_t_df =data.frame(document_topics_t)
colnames(document_topics_t_df) = topic_name_t
rownames(document_topics_t_df) = NULL
news_topic_t = cbind(tsai_data_ori, document_topics_t_df)
news_topic_t %>% head(10)
##                                            artTitle    artDate  artTime
##  1:            [新聞]洪慈庸:明確支持蔡英文連任2020 2019-08-01 03:04:10
##  2:    [新聞]柯文哲組黨震撼2020?周偉航:蔡英文穩了 2019-08-01 03:37:56
##  3:      [新聞]華航贊助蔡英文攝影展?總統府攝影師: 2019-08-01 03:39:07
##  4:    [新聞]2020總統大選》時代力量若決議挺蔡英文? 2019-08-01 04:04:25
##  5:    [新聞]陸限縮自由行蔡英文批:戰略上很大的錯誤 2019-08-01 04:05:00
##  6:    [新聞]快訊/柯文哲組黨 蔡英文籲團結護主權: 2019-08-01 04:14:32
##  7:          [新聞]范雲表態:團結抗中保台支持蔡英文 2019-08-01 04:31:15
##  8: Re:[新聞]快訊/柯文哲組黨 蔡英文籲團結護主權: 2019-08-01 04:36:04
##  9:      [新聞]時力風暴》新竹市議會黨團:肯定蔡英文 2019-08-01 04:38:14
## 10:         Re:[新聞]洪慈庸:明確支持蔡英文連任2020 2019-08-01 04:45:22
##                                                       artUrl   artPoster
##  1: https://www.ptt.cc/bbs/Gossiping/M.1564657817.A.607.html armorblocks
##  2: https://www.ptt.cc/bbs/Gossiping/M.1564659840.A.CF6.html     anz5566
##  3: https://www.ptt.cc/bbs/Gossiping/M.1564659911.A.A2B.html     qqq5566
##  4: https://www.ptt.cc/bbs/Gossiping/M.1564661433.A.108.html      ueewen
##  5: https://www.ptt.cc/bbs/Gossiping/M.1564661463.A.533.html     LIN6627
##  6: https://www.ptt.cc/bbs/Gossiping/M.1564662034.A.C51.html      Moogle
##  7: https://www.ptt.cc/bbs/Gossiping/M.1564663040.A.E77.html       iasyt
##  8: https://www.ptt.cc/bbs/Gossiping/M.1564663326.A.964.html     fifa186
##  9: https://www.ptt.cc/bbs/Gossiping/M.1564663456.A.64B.html  whokisswho
## 10: https://www.ptt.cc/bbs/Gossiping/M.1564663885.A.993.html   attilalin
##        artCat         經濟         民調 呂秀蓮、質疑 兩岸、國家定位
##  1: Gossiping 0.3122954972 0.0001472250 0.0001472250   0.0001472250
##  2: Gossiping 0.0002311591 0.9979195684 0.0002311591   0.0002311591
##  3: Gossiping 0.0003139361 0.0003139361 0.0003139361   0.0003139361
##  4: Gossiping 0.0002201757 0.4003635873 0.4535858036   0.0002201757
##  5: Gossiping 0.0001182441 0.0001182441 0.0001182441   0.0001182441
##  6: Gossiping 0.0001986063 0.0677766623 0.0001986063   0.0001986063
##  7: Gossiping 0.1458583508 0.0264941604 0.0001260559   0.0001260559
##  8: Gossiping 0.0002588264 0.0002588264 0.0002588264   0.2827041312
##  9: Gossiping 0.2637327357 0.0001579805 0.0001579805   0.0001579805
## 10: Gossiping 0.0002651741 0.0398247190 0.0002651741   0.0002651741
##       韓粉、造勢 博士學位、論文         美國       反送中       副總統
##  1: 0.0001472250   0.0001472250 0.5388867555 0.0001472250 0.0001472250
##  2: 0.0002311591   0.0002311591 0.0002311591 0.0002311591 0.0002311591
##  3: 0.0003139361   0.0003139361 0.0003139361 0.5937909092 0.4036976019
##  4: 0.0779124527   0.0002201757 0.0002201757 0.0002201757 0.0002201757
##  5: 0.0001182441   0.0001182441 0.3939173555 0.6051366914 0.0001182441
##  6: 0.0001986063   0.0001986063 0.6838814016 0.0001986063 0.2469516920
##  7: 0.0001260559   0.0001260559 0.2069399225 0.6199512310 0.0001260559
##  8: 0.0002588264   0.0002588264 0.1064141294 0.0002588264 0.0002588264
##  9: 0.0001579805   0.0001579805 0.6300887406 0.0001579805 0.0001579805
## 10: 0.0002651741   0.0002651741 0.4605384347 0.0002651741 0.0002651741
##         時代力量
##  1: 0.1477871724
##  2: 0.0002311591
##  3: 0.0003139361
##  4: 0.0668171020
##  5: 0.0001182441
##  6: 0.0001986063
##  7: 0.0001260559
##  8: 0.6090699549
##  9: 0.1050726601
## 10: 0.4977806278

查看特定主題

news_topic_t %>%
  arrange(desc(`經濟`))%>%head(10)

了解蔡英文的主題在時間的變化

#news_topic_t[,c(7:16)] =sapply(news_topic_t[,c(7:16)] , as.numeric)

news_topic_t %>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(text = element_text(family='STHeitiTC-Light'))

## 去除主題為None1, None2, None3

news_topic_t %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.

以比例了解蔡英文的主題時間變化

news_topic_t %>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(text = element_text(family='STHeitiTC-Light'))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

韓國瑜

new_lda_h = ldas_h[[5]] ## 選定topic 為10 的結果

topics_h <- tidy(new_lda_h, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_h
## # A tibble: 674,460 x 3
##    topic term           beta
##    <int> <chr>         <dbl>
##  1     1 安排  0.00000000913
##  2     2 安排  0.00000405   
##  3     3 安排  0.000157     
##  4     4 安排  0.000637     
##  5     5 安排  0.000135     
##  6     6 安排  0.000973     
##  7     7 安排  0.0000384    
##  8     8 安排  0.000163     
##  9     9 安排  0.00152      
## 10    10 安排  0.0000217    
## # ... with 674,450 more rows
remove_words_h = c("高雄", "高雄市", "市長","蔡英文","韓國瑜","台灣","總統","記者","媒體","表示","我們","完整","沒有","來源","內文","連結","大家","我們","就是","新聞標題","只是","這樣","還是","可以","自己","不是","一定","網址","什麼","很多","備註","如果","所以","知道","現在","一個","覺得","怎麼","新聞", "今天","不會","這個","這種","一樣","因為","對此","報導","署名","這些","應該","只要","然後","不要","出來","但是","你們","蔡總統","根本","結果","問題","民進黨","其實","真的","可能","到底","他們","這麼","國民黨","柯文哲","一堆","一下","不能","是不是","還有","比較","候選人","參選人","只有","一直","已經","時候","看到", "那麼", "直接", "對於", "現場", "中國", "美國", "支持", "新聞內文", "媒體來源", "完整新聞標題", "完整新聞", "小英", "中國", "支持", "市長")

top_terms_h <- topics_h %>%
  filter(!term  %in% remove_words_h)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_h %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family='STHeitiTC-Light'))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

韓國瑜的主題

topic_name_h = c('民調', '黃國昌、砂石', '國家機器、豪宅', '日本學者、遲到', '兩岸', '造勢遊行', 'None', '愛情摩天輪', '新北、造勢', '庶民、草包')
tmResult_h <- posterior(new_lda_h)
doc_pro_h <- tmResult_h$topics 
dim(doc_pro_h)  
## [1] 6839   10
# get document topic proportions
hen_data_ori <- hen_data_ori %>%
  select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics_h <- doc_pro_h[hen_data_ori$artUrl,]
document_topics_h_df =data.frame(document_topics_h)
colnames(document_topics_h_df) = topic_name_h
rownames(document_topics_h_df) = NULL
news_topic_h = cbind(hen_data_ori, document_topics_h_df)
news_topic_h %>% head(10)
##                                            artTitle    artDate  artTime
##  1:    [新聞]王金平拒當韓國瑜副手原因曝光! 「一定 2019-08-01 18:52:07
##  2: Re:[新聞]王金平拒當韓國瑜副手原因曝光! 「一定 2019-08-01 20:14:37
##  3:      [新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 20:18:40
##  4:   Re:[新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 20:20:52
##  5:    [新聞]「鎂光燈閃眼睛很不舒服」 韓國瑜火氣大 2019-08-01 20:43:40
##  6:     [新聞]愛情摩天輪8月要蓋..又沒了?韓國瑜:我 2019-08-01 21:19:25
##  7:              [爆卦]中天、中時極力抹黑唱衰韓國瑜 2019-08-01 21:21:55
##  8:   Re:[新聞]愛情摩天輪被疑跳票韓國瑜又火了:何時 2019-08-01 21:42:54
##  9:    [新聞]尷尬!韓國瑜表揚模範父親 匾額突墜地摔 2019-08-01 22:06:56
## 10: Re:[新聞]「鎂光燈閃眼睛很不舒服」 韓國瑜火氣大 2019-08-01 22:18:51
##                                                       artUrl artPoster
##  1: https://www.ptt.cc/bbs/Gossiping/M.1564714689.A.199.html    yenkin
##  2: https://www.ptt.cc/bbs/Gossiping/M.1564719641.A.3EC.html s910443tw
##  3: https://www.ptt.cc/bbs/Gossiping/M.1564719884.A.6E6.html  derekgao
##  4: https://www.ptt.cc/bbs/Gossiping/M.1564720014.A.F13.html   jacklyl
##  5: https://www.ptt.cc/bbs/Gossiping/M.1564721383.A.B6D.html   jiunyee
##  6: https://www.ptt.cc/bbs/Gossiping/M.1564723529.A.E0E.html    xamous
##  7: https://www.ptt.cc/bbs/Gossiping/M.1564723678.A.3D5.html btm978952
##  8: https://www.ptt.cc/bbs/Gossiping/M.1564724936.A.25B.html     Bluce
##  9: https://www.ptt.cc/bbs/Gossiping/M.1564726380.A.4B0.html   meiyouo
## 10: https://www.ptt.cc/bbs/Gossiping/M.1564727096.A.A44.html  shokotan
##        artCat         民調 黃國昌、砂石 國家機器、豪宅 日本學者、遲到
##  1: Gossiping 0.0002055469 0.0002055469   0.0002055469   0.0002055469
##  2: Gossiping 0.0011753340 0.0011753340   0.0011753340   0.0011753340
##  3: Gossiping 0.0004219897 0.0004219897   0.0004219897   0.0004219897
##  4: Gossiping 0.0022523911 0.0022523911   0.0022523911   0.0022523911
##  5: Gossiping 0.0003114733 0.0003114733   0.0003114733   0.0003114733
##  6: Gossiping 0.0002229484 0.0002229484   0.1858169637   0.0002229484
##  7: Gossiping 0.0019035201 0.6219264607   0.0019035201   0.0019035201
##  8: Gossiping 0.0006908299 0.0006908299   0.0657089422   0.5411732386
##  9: Gossiping 0.0002284748 0.0002284748   0.0002284748   0.0002284748
## 10: Gossiping 0.0004704517 0.0004704517   0.1009497197   0.0423990497
##             兩岸     造勢遊行         None   愛情摩天輪   新北、造勢
##  1: 0.0002055469 0.0002055469 0.0002055469 0.0002055469 0.8632137425
##  2: 0.0011753340 0.0011753340 0.0011753340 0.0011753340 0.0011753340
##  3: 0.0004219897 0.3902069505 0.0004219897 0.6064171320 0.0004219897
##  4: 0.0022523911 0.7511070630 0.0022523911 0.2308738079 0.0022523911
##  5: 0.0003114733 0.9015488383 0.0003114733 0.0003114733 0.0959593754
##  6: 0.0002229484 0.2562265348 0.0002229484 0.5563958627 0.0002229484
##  7: 0.0019035201 0.0019035201 0.3628453784 0.0019035201 0.0019035201
##  8: 0.0006908299 0.1513105740 0.0006908299 0.0006908299 0.0006908299
##  9: 0.0002284748 0.4766214571 0.0002284748 0.0002284748 0.5215507441
## 10: 0.0004704517 0.0004704517 0.0004704517 0.0004704517 0.6081083117
##       庶民、草包
##  1: 0.1351418819
##  2: 0.9894219937
##  3: 0.0004219897
##  4: 0.0022523911
##  5: 0.0003114733
##  6: 0.0002229484
##  7: 0.0019035201
##  8: 0.2376622658
##  9: 0.0002284748
## 10: 0.2457202084

查看特定主題

news_topic_h %>%
  arrange(desc(`愛情摩天輪`))%>%head(10)

了解韓國瑜的主題在時間的變化

#news_topic_h[,c(7:16)] =sapply(news_topic_h[,c(7:16)] , as.numeric)

news_topic_h %>% 
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(text = element_text(family='STHeitiTC-Light'))

去除主題為None

news_topic_h %>%
  dplyr::select(-None)%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.

以比例了解韓國瑜的主題時間變化

news_topic_h %>%
  dplyr::select(-None)%>%
  group_by(artDate = format(artDate,'%Y%m')) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
  ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning in melt(., id.vars = "artDate"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is
## now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace
## like reshape2::melt(.). In the next version, this warning will become an
## error.