系統設置

#Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # For ubuntu
Sys.setlocale("LC_CTYPE", "cht") # For windows.

## [1] "Chinese (Traditional)_Taiwan.950"

安裝需要的packages

packages = c("readr","tm", "data.table", "dplyr", "stringr","tidyverse", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot","purrr","ramify","RColorBrewer","wordcloud2", "htmlwidgets","servr","scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(pacman)

require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

讀取批踢踢檔案

fish_hatePolitics <- fread(file="rawdata.csv",header = T,stringsAsFactors = F,encoding = "UTF-8",data.table = FALSE )

轉換日期格式

fish_hatePolitics$artDate= fish_hatePolitics$artDate %>% as.Date("%Y/%m/%d")

資料描述

2019年4月23日，高雄市市長韓國瑜宣布參選2020總統大選。
透過中山管院文字分析平台在批踢踢的HatePolitics看板，搜尋關鍵字：[韓國瑜、韓國魚、霸韓、韓導、韓總]，時間從2019-04-23~2020-04-22，為期一年的資料，總篇數為23735篇。

資料預覽

fish_hatePolitics %>% 
  mutate(year = year(artDate), month = month(artDate)) %>%
  group_by(year,month) %>%
  summarise(count = n())%>%
  ggplot(aes(x=month,y=count))+
  geom_bar(stat="identity")+
  facet_grid(~year)

1.可以觀察到資料主要分佈去年5-6月以及12月，今年討論聲量則明顯下降。
2.【聲量高點原因推測】
(1)2019年5-6月：韓國瑜剛宣布要參選總統，引發諸如“烙跑市長”、“選上總統，高雄上班”等議題。
(2)2020年12月：鄰近一月中的總統大選，正值“競選活動”與“總統辯論會”的高峰期。

Tokenization

初始化斷詞引擎，並加入停用字

jieba_tokenizer = worker(stop_word = "stop_words.txt")
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")

自定義斷詞

new_user_word(jieba_tokenizer,c("韓國瑜","韓國魚","不支持","柯文哲","九二共識","競選活動","罷韓遊行","扶不起的阿斗","高雄市市府","高雄市市長","造勢活動","蔡英文","時代力量","黃國昌","支持罷韓","支持罷免","光復高雄"))

## [1] TRUE

去掉字串長度爲1的詞彙

fish_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens)>1] 
      return(tokens)
    }
  })
}

過濾特殊字元

tokens <- fish_hatePolitics %>%
  mutate(id=c(1:nrow(fish_hatePolitics))) %>%
  unnest_tokens(word, sentence, token=fish_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]")))

把名稱統一

tokens$word[which(tokens$word == c("韓國魚","韓董","韓導","韓總"))] = "韓國瑜"

## Warning in tokens$word == c("韓國魚", "韓董", "韓導", "韓總"): 較長的物件長度並
## 非較短物件長度的倍數

tokens$word[which(tokens$word == "郭董")] = "郭台銘"
tokens$word[which(tokens$word == c("柯p","柯P"))] = "柯文哲"

## Warning in tokens$word == c("柯p", "柯P"): 較長的物件長度並非較短物件長度的倍數

tokens$word[which(tokens$word == "小英")] = "蔡英文"

查看前20筆資料

tokens %>% head(20)

##                    artTitle    artDate  artTime
## 1  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 2  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 3  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 4  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 5  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 6  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 7  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 8  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 9  [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 10 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 11 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 12 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 13 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 14 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 15 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 16 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 17 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 18 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 19 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
## 20 [討論]還以為韓總是聰明人 2019-04-23 10:24:28
##                                                         artUrl artPoster
## 1  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 2  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 3  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 4  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 5  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 6  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 7  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 8  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 9  https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 10 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 11 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 12 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 13 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 14 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 15 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 16 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 17 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 18 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 19 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
## 20 https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html   WER0930
##          artCat commentNum push boo id     word
## 1  HatePolitics         84   18   3  1     以為
## 2  HatePolitics         84   18   3  1   韓國瑜
## 3  HatePolitics         84   18   3  1     兵法
## 4  HatePolitics         84   18   3  1     高雄
## 5  HatePolitics         84   18   3  1     這局
## 6  HatePolitics         84   18   3  1     棋中
## 7  HatePolitics         84   18   3  1 力挽狂瀾
## 8  HatePolitics         84   18   3  1     我本
## 9  HatePolitics         84   18   3  1     認為
## 10 HatePolitics         84   18   3  1     要選
## 11 HatePolitics         84   18   3  1     總統
## 12 HatePolitics         84   18   3  1     意思
## 13 HatePolitics         84   18   3  1     最好
## 14 HatePolitics         84   18   3  1     時機
## 15 HatePolitics         84   18   3  1     屁股
## 16 HatePolitics         84   18   3  1     還沒
## 17 HatePolitics         84   18   3  1     坐熱
## 18 HatePolitics         84   18   3  1   豈不是
## 19 HatePolitics         84   18   3  1   朱立倫
## 20 HatePolitics         84   18   3  1 前車之鑑

詞頻

計算所有字在“單一文集”中的總詞頻

word_count <- tokens%>% 
   group_by(id,artTitle,artDate,word) %>% 
  summarise(count = n()) %>%
  ungroup()

計算所有字在“所有文集”中的總詞頻

tokens_count <- word_count %>% 
   group_by(word) %>% 
  summarise(count = sum(count)) %>% 
  arrange(desc(count))

tokens %>%
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  ylab("出現次數") +
  coord_flip()

## Selecting by n

詞頻文字雲

wordcloud2(tokens_count, size = 1)

分析總統大選前後討論內容的差異

總統大選前文字雲

word_count %>% 
  filter(artDate == as.Date('2020-01-11')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>20) %>%   # 過濾出現太少次的字
  wordcloud2()

wordcloud

總統大選後文字雲

word_count %>% 
  filter(artDate == as.Date('2020-01-13')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>20) %>%   # 過濾出現太少次的字
  wordcloud2()

$wordcloud$

wordcloud

一月十一日總統大選後，“罷免”字眼成為熱門

罷免聲量

word_count %>% 
  filter(word == "罷免" | word =="下台" | word == "罷韓" | word == "光復高雄") %>% 
  ggplot(aes(x=artDate,y=count))+
  geom_bar(stat = "identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%m/%d")

“罷免”達到巔峰的十個日子

word_count %>% 
  filter(word == "罷免" | word =="下台" | word == "罷韓" | word == "光復高雄") %>% 
  group_by(artDate = artDate) %>% 
  summarise(count=sum(count)) %>% 
  arrange(desc(count)) %>% 
  slice(1:10)

## # A tibble: 10 x 2
##    artDate    count
##    <date>     <int>
##  1 2020-04-08   110
##  2 2020-01-13    91
##  3 2019-12-22    89
##  4 2020-01-11    89
##  5 2019-12-21    84
##  6 2020-01-30    74
##  7 2020-01-14    72
##  8 2019-09-13    70
##  9 2019-12-11    69
## 10 2019-06-27    65

2020年04月：韓國瑜遞狀聲請停止執行罷免案、提出普篩制度、提出海軍官兵若拒絕疫調將開罰
2020年01月：總統大選 2019年12月：總統大選辯論會、韓國瑜上博恩夜夜秀、wecare 大遊行
2019年09月：韓國瑜否定“挖石油”言論、郭台銘退出國民黨
2019年06月：韓國瑜請假表公開、學生當面嗆韓

草包聲量

word_count %>% 
  filter(word == "草包") %>% 
  ggplot(aes(x=artDate,y=count))+
  geom_bar(stat = "identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%m/%d")

“草包”達到巔峰的十個日子

word_count %>%
  filter(word == "草包") %>% 
  group_by(artDate) %>% 
  summarise(count=sum(count)) %>% 
  arrange(desc(count)) %>% 
  slice(1:10)

## # A tibble: 10 x 2
##    artDate    count
##    <date>     <int>
##  1 2019-05-13    35
##  2 2019-12-01    33
##  3 2019-05-03    31
##  4 2019-05-06    31
##  5 2019-05-04    28
##  6 2019-05-17    27
##  7 2019-05-07    26
##  8 2020-01-11    22
##  9 2019-08-29    21
## 10 2019-12-03    21

2019年05月：草包之歌流傳
2019年12月：總統大選辯論會、韓國瑜上博恩夜夜秀、wecare 大遊行
2020年01月：總統大選
2019年08月：晶晶體風波

情緒分析

準備LIWC字典

全名Linguistic Inquiry and Word Counts，由心理學家Pennebaker於2001出版

以LIWC字典判斷文集中的word屬於正面字還是負面字

# 正向字典txt檔
# 以,將字分隔
P <- read_file("positive.txt")

# 負向字典txt檔
N <- read_file("negative.txt")

#字典txt檔讀進來是一個字串
typeof(P)

## [1] "character"

#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive",stringsAsFactors = F)
N = data.frame(word = N, sentiment = "negative",stringsAsFactors = F)

LIWC = rbind(P, N)

與LIWC情緒字典做join

文集中的字出現在LIWC字典中是屬於positive還是negative

tokens_count %>% 
  inner_join(LIWC) %>% 
  arrange(desc(count)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(sentiment) %>% 
  top_n(n=10,wt=count) %>% 
  ungroup() %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  labs(x = NULL, y = "詞頻") +
  facet_wrap(~sentiment, ncol = 2, scales = "free") +
  coord_flip()#轉置

## Joining, by = "word"

以LIWC情緒字典分析

統計每天的文章正面字的次數與負面字的次數

sentiment_count <- word_count %>%
  select(artDate,word,count) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))

## Joining, by = "word"

過去一年的情緒起伏

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date( date_labels = "%m/%d")

1月11日總統大選後，情緒起伏

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(date_labels = "%m/%d",limits = as.Date(c("2020-01-01","2020-03-01")))+
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2020/01/11')) [1]])),colour = "black",linetype="dashed",alpha=.3)

## Warning: Removed 572 row(s) containing missing values (geom_path).

將資料轉換為Document Term Matrix (DTM)

tokens_dtm <- tokens %>%
  count(artUrl, word) %>%
  rename(count=n)


fish_dtm <- tokens_dtm %>% 
  cast_dtm(artUrl, word, count) 
fish_dtm

## <<DocumentTermMatrix (documents: 23735, terms: 110456)>>
## Non-/sparse entries: 1317797/2620355363
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)

inspect(fish_dtm[1:10,1:10])  #查看前十筆資料

## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity           : 88%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##                                                              Terms
## Docs                                                          九二共識 人物
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html        1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html        0    0
##                                                              Terms
## Docs                                                          力量 土包子 不好
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    2      3    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0      0    0
##                                                              Terms
## Docs                                                          中心 內容 分享
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    2    1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0    1    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0    1    0
##                                                              Terms
## Docs                                                          午宴 心得
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0    0

查看DTM矩陣，可以發現是個稀疏矩陣。

建立LDA模型

fish_lda <- LDA(fish_dtm, k = 2, control = list(seed = 1234))

$\phi$ Matrix

查看$\phi$ matrix (topic * term)

topics <- tidy(fish_lda, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
topics

## # A tibble: 220,912 x 3
##    topic term          beta
##    <int> <chr>        <dbl>
##  1     1 九二共識 0.000114 
##  2     2 九二共識 0.000170 
##  3     1 人物     0.000654 
##  4     2 人物     0.000865 
##  5     1 力量     0.000645 
##  6     2 力量     0.000120 
##  7     1 土包子   0.0000430
##  8     2 土包子   0.0000439
##  9     1 不好     0.000474 
## 10     2 不好     0.000774 
## # ... with 220,902 more rows

從topics中可以得到特定主題生成特定詞彙的概率。

尋找Topic的代表字

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(topic = as.factor(topic),
      term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = topic))  +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()+
  scale_x_reordered()

尋找兩個主題差異最大的詞彙

topics %>%
  mutate( topic = paste0("topic",topic)) %>%   #根據主題名稱重新命名欄位
  spread(topic, beta) %>%
  filter( topic1 > .001 |topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))%>%
  filter(abs(log_ratio) > 3.95) %>%
  mutate(term = reorder(term, log_ratio)) %>%   #依據log_ratio值(主題類別)排序詞項
  ggplot(aes(log_ratio, term)) +
  geom_col(show.legend = FALSE)

1.上圖中，左側為主題一，右側為主題二。
2.透過上方的兩張圖，感覺兩個主題看起來差不多，沒有明顯的差異，嘗試看看分多一點topics。

LDA後續分析

根據前面的探索之後，我們對於資料有更加了解，並且看完每個主題數的LDAvis之後，選定主題數10的結果來作後續的分析查看各個主題的單詞組成比率

fish_lda = ldas[[3]] ## 選定topic 為10 的結果

topics <- tidy(fish_lda, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
topics

## # A tibble: 1,104,560 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 九二共識 1.18e-24
##  2     2 九二共識 1.31e-15
##  3     3 九二共識 3.86e-14
##  4     4 九二共識 2.93e-30
##  5     5 九二共識 4.98e-24
##  6     6 九二共識 2.04e-21
##  7     7 九二共識 6.24e-19
##  8     8 九二共識 1.37e-16
##  9     9 九二共識 5.17e-29
## 10    10 九二共識 1.29e- 3
## # ... with 1,104,550 more rows

每一行代表一個主題中的一個詞彙

尋找Topic的代表字

整理出每一個Topic中生成概率最高的10個詞彙。

#取出每一個Topic中生成概率最高(beta值最高)的10個詞彙
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>% 
  ungroup() %>%
  arrange(topic, -beta)

#繪製長條圖
top_terms %>%
  mutate(topic = as.factor(topic),
      term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = topic))  +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()+
  scale_x_reordered()

可以看到topic都被一開始所使用的搜尋關鍵字影響看不出每一群的差異。

移除常出現、跨主題共享的詞彙。

remove_word = c("韓國瑜","高雄","總統","市長","韓粉","國民黨","韓導","韓總","一定","完全","比較","還要","表示","新聞","記者","去年","問題")
top_terms <- topics %>%
  filter(!term  %in% remove_word)%>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms %>%
  mutate(topic = as.factor(topic),
      term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = topic))  +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()+
  scale_x_reordered()

可以看出每個主題主要在討論什麼了！

主題命名

topic_name = c('造勢活動','韓國瑜 V.S. 郭台銘','民調：韓 V.S.  蔡、郭、柯','議會質詢','直播狂熱粉','防疫議題','罷韓議題','黨內大佬立場','網軍黑韓','國際局勢')

Document 主題分佈

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(fish_lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics

## [1] 23735    10

每篇文章都有topic的分佈，所以總共是：23735筆的文章*10個主題

cbind Document 主題分佈

查看每一篇文章的各個主題組成比率

# get document topic proportions 
document_topics <- doc_pro[fish_hatePolitics$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
fish_topic = cbind(fish_hatePolitics,document_topics_df)

查看特定主題的文章

透過找到特定文章的分佈進行排序之後，可以看到此主題的比重高的文章在討論什麼。

fish_topic %>%
  arrange(desc(`民調：韓 V.S.  蔡、郭、柯`)) %>%
  head(10)

可以看到“民調：韓 V.S. 蔡、郭、柯”這個主題主要涵蓋了“韓、蔡、郭、柯的各式民調變動”。

了解主題在時間的變化

fish_topic[,c(11:20)] =sapply(fish_topic[,c(11:20)] , as.numeric)



fish_topic %>% 
  group_by(artDate = format(artDate,"%Y%m")) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
 ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

由於我們在今年2月-4月的資料太少，所以將這三個月去除。

去除筆數少月份

fish_topic %>% 
  group_by(artDate = format(artDate,"%Y%m")) %>%
  filter(artDate < 202002) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
 ggplot( aes(x=artDate, y=value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("value") + 
  scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

可以看出每個月的聲量，但是不能很清楚出每個月的比例

以比例了解主題時間變化

fish_topic %>% 
  group_by(artDate = format(artDate,"%Y%m")) %>%
  filter(artDate < 202002) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate")%>%
  group_by(artDate)%>%
  mutate(total_value =sum(value))%>%
 ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
      scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

現在我們可以看到每個月主題的佔比了！

批踢踢韓國瑜聲量分析

讀書會第二組徐湛、張傳銘、李沂倩、郭睿紘、林益祥、徐建德

系統設置

安裝需要的packages

讀取批踢踢檔案

轉換日期格式

資料描述

資料預覽

Tokenization

詞頻

分析總統大選前後討論內容的差異

罷免聲量

草包聲量

情緒分析

準備LIWC字典

以LIWC字典判斷文集中的word屬於正面字還是負面字

與LIWC情緒字典做join

以LIWC情緒字典分析

將資料轉換為Document Term Matrix (DTM)

建立LDA模型

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

尋找Topic的代表字

尋找兩個主題差異最大的詞彙

更多主題

載入每個主題的LDA結果

透過perplexity找到最佳主題數

LDA後續分析

尋找Topic的代表字

主題命名

Document 主題分佈

cbind Document 主題分佈

查看特定主題的文章

了解主題在時間的變化

去除筆數少月份

以比例了解主題時間變化

批踢踢韓國瑜聲量分析

讀書會第二組 徐 湛、張傳銘、李沂倩、郭睿紘、林益祥、徐建德

系統設置

安裝需要的packages

讀取批踢踢檔案

轉換日期格式

資料描述

資料預覽

Tokenization

詞頻

分析總統大選前後討論內容的差異

罷免聲量

草包聲量

情緒分析

準備LIWC字典

以LIWC字典判斷文集中的word屬於正面字還是負面字

與LIWC情緒字典做join

以LIWC情緒字典分析

將資料轉換為Document Term Matrix (DTM)

建立LDA模型

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

尋找Topic的代表字

尋找兩個主題差異最大的詞彙

更多主題

載入每個主題的LDA結果

透過perplexity找到最佳主題數

LDA後續分析

尋找Topic的代表字

主題命名

Document 主題分佈

cbind Document 主題分佈

查看特定主題的文章

了解主題在時間的變化

去除筆數少月份

以比例了解主題時間變化

讀書會第二組徐湛、張傳銘、李沂倩、郭睿紘、林益祥、徐建德