System Setup

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # For Ubuntu
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""
# Sys.setlocale("LC_CTYPE", "cht") # For Windows.

Install the required packages

packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot","purrr","ramify","RColorBrewer", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(tm)
require(data.table)
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
require(LDAvis)
require(wordcloud2)
require(webshot)
require(htmlwidgets)
require(servr)
require(purrr)
require(ramify)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)  # 20-colour palette for the topic plots

Reading the Data

Read the PTT posts

fish_hatePolitics <- fread(file = "rawdata.csv", header = TRUE, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE)

Data Description

  • On April 23, 2019, Kaohsiung Mayor Han Kuo-yu (韓國瑜) announced his candidacy for the 2020 presidential election.
  • Using the NSYSU College of Management text-analysis platform, we searched PTT's HatePolitics board for the keywords [韓國瑜、韓國魚、霸韓、韓導、韓總] over one year, 2019-04-23 to 2020-04-22, collecting 23,735 posts in total.
# monthly post counts, faceted by year
fish_hatePolitics %>%
  mutate(year = year(artDate), month = month(artDate)) %>%
  group_by(year, month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = month, y = count)) +
  geom_bar(stat = "identity") +
  facet_grid(~year)

1. The posts are concentrated in May–June and December of 2019, while discussion volume in 2020 drops off markedly.
2. Likely reasons for the volume peaks:
(1) May–June 2019: Han Kuo-yu had just announced his presidential run, sparking controversies such as "烙跑市長" (the runaway mayor) and "選上總統,高雄上班" (win the presidency, govern from Kaohsiung).
(2) December 2019: in the run-up to the presidential election in mid-January, campaign rallies and the televised presidential debates were at their height.

Tokenization

# initialize the segmentation engine with a user dictionary and a stop-word list
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")
new_user_word(jieba_tokenizer, c("韓國瑜","韓國魚","不支持","柯文哲","九二共識","競選活動","罷韓遊行","扶不起的阿斗","高雄市市府","高雄市市長","造勢活動","蔡英文","時代力量"))  # add custom words
## [1] TRUE
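As a quick sanity check, we can segment a short sentence directly; terms registered via new_user_word should come back as single tokens. The example sentence below is ours, for illustration only:

segment("韓國瑜宣布參選2020總統大選", jieba_tokenizer)
# 韓國瑜 should appear as one token rather than being split into characters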
fish_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[nchar(tokens) > 1]  # drop single-character tokens
      return(tokens)
    }
  })
}
tokens <- fish_hatePolitics %>%
  unnest_tokens(word, sentence, token = fish_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%  # drop tokens containing digits or Latin letters
  count(artUrl, word) %>%
  rename(count = n)

# normalize aliases to canonical names (%in% is required here; == would recycle the vector)
tokens$word[tokens$word %in% c("韓國魚","韓董","韓導")] = "韓國瑜"
tokens$word[tokens$word == "郭董"] = "郭台銘"
tokens$word[tokens$word %in% c("柯p","柯P")] = "柯文哲"  # note: the Latin-letter filter above may already have removed these
tokens$word[tokens$word == "小英"] = "蔡英文"

tokens %>% head(20)

Convert the data into a Document-Term Matrix (DTM)

fish_dtm <- tokens %>% cast_dtm(artUrl, word, count)  # documents = articles, terms = words
fish_dtm
## <<DocumentTermMatrix (documents: 23735, terms: 110521)>>
## Non-/sparse entries: 1347152/2621868783
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)
inspect(fish_dtm[1:10, 1:10])  # peek at the first 10 documents and terms
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity           : 88%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##                                                              Terms
## Docs                                                          九二共識 人物
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html        1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html        0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html        0    0
##                                                              Terms
## Docs                                                          力量 土包子 不好
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    2      3    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0      0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0      0    0
##                                                              Terms
## Docs                                                          中心 內容 分享
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    2    1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0    1    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0    1    0
##                                                              Terms
## Docs                                                          午宴 心得
##   https://www.ptt.cc/bbs/HatePolitics/M.1555068495.A.5D0.html    1    1
##   https://www.ptt.cc/bbs/HatePolitics/M.1555128913.A.636.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1555477358.A.A83.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556044230.A.F4B.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556047007.A.BD2.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556049755.A.D99.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556050417.A.5C3.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556051849.A.913.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556053627.A.B8B.html    0    0
##   https://www.ptt.cc/bbs/HatePolitics/M.1556057390.A.908.html    0    0

Inspecting the DTM confirms that it is an extremely sparse matrix.
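If the later model fitting becomes slow, one common mitigation is to drop extremely rare terms before running LDA. A minimal sketch using tm::removeSparseTerms; the 0.99 threshold is an illustrative assumption, not a value used in this analysis:

# keep only terms that appear in at least ~1% of documents (illustrative threshold)
fish_dtm_small <- removeSparseTerms(fish_dtm, sparse = 0.99)
fish_dtm_small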

Build the LDA Model

fish_lda <- LDA(fish_dtm, k = 2, control = list(seed = 1234))  # start with two topics; fix the seed for reproducibility

\(\phi\) Matrix

View the \(\phi\) matrix (topic × term)

topics <- tidy(fish_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
topics

From topics we can read off, for each topic, the probability \(\beta\) that it generates each term.
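For example, to see how strongly each topic generates one specific term, we can filter the tidied output (韓國瑜 is used here because we know it is in the vocabulary):

topics %>%
  filter(term == "韓國瑜") %>%
  arrange(desc(beta))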

Find Representative Terms for Each Topic

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Find the Terms That Differ Most Between the Two Topics

topics %>%
  mutate(topic = paste0("topic", topic)) %>%   # rename so topics spread into topic1/topic2 columns
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1)) %>%
  filter(log_ratio > 10 | log_ratio < -3) %>%
  mutate(term = reorder(term, log_ratio)) %>%   # order terms by log ratio
  ggplot(aes(log_ratio, term)) +
  geom_col(show.legend = FALSE)
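What the pipeline above plots is the log odds of each term between the two topics:

\[ \mathrm{log\_ratio}_w = \log_2 \frac{\beta_{\text{topic2},\,w}}{\beta_{\text{topic1},\,w}} \]

Large positive values mark terms far more likely under topic 2, and large negative values terms far more likely under topic 1; the asymmetric cutoffs (> 10 and < -3) appear to be chosen simply to keep the plot readable.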

1. In the plot above, the left side (negative log ratio) corresponds to topic 1 and the right side (positive log ratio) to topic 2.
2. Judging from the two plots, the two topics look much alike with no clear separation, so we try a larger number of topics.

More Topics

  • Fit models with 2, 5, 8, 10, 15, and 25 topics, save the results, and analyze them further
# ldas = list()
# topics = c(2, 5, 8, 10, 15, 25)
# for (topic in topics) {
#   start_time <- Sys.time()
#   lda <- LDA(fish_dtm, k = topic, control = list(seed = 1234))
#   ldas <- c(ldas, lda)
#   print(paste(topic, "topic(s); elapsed:", Sys.time() - start_time))
#   save(ldas, file = "ldas_result")
# }

Because fitting all of these takes a long time, the results have been saved to ldas_result.

Load the Saved LDA Results

load("ldas_result")

Use Perplexity to Find the Best Number of Topics

topics = c(2, 5, 8, 10, 15, 25)
tibble(k = topics,
       perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

Lower perplexity is better, but if the number of topics is too large the themes get split too finely. The usual practice is to pick a moderate number of topics with relatively low perplexity, so for the rest of the analysis we use 10 topics.
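One caveat: perplexity computed on the same documents a model was trained on tends to keep falling as k grows. A more robust comparison scores perplexity on held-out documents; a minimal sketch, where the 80/20 split, the seed, and k = 10 are assumptions for illustration:

# fit on 80% of documents, score perplexity on the held-out 20%
set.seed(1234)
train_idx <- sample(seq_len(nrow(fish_dtm)), size = floor(0.8 * nrow(fish_dtm)))
lda_train <- LDA(fish_dtm[train_idx, ], k = 10, control = list(seed = 1234))
perplexity(lda_train, newdata = fish_dtm[-train_idx, ])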

Further LDA Analysis

fish_lda = ldas[[4]]  # the 4th model in ldas corresponds to k = 10

topics <- tidy(fish_lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the phi matrix
topics

Each row represents one term within one topic.

Find Representative Terms for Each Topic

  • List the 10 terms with the highest generation probability in each topic.
# take the 10 terms with the highest beta in each topic
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# bar chart of the top terms, one facet per topic
top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

The top terms of every topic are dominated by the original search keywords, so the differences between topics are hard to see.

  • Remove frequent terms that are shared across topics.
remove_word = c("韓國瑜","高雄","韓粉","國民黨" ,"總統","已經" ,"市長","根本" ,"是不是","一直","新聞","報導","韓導","韓總")
top_terms <- topics %>%
  filter(!term %in% remove_word) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Now we can see what each topic is mainly discussing!

Topic Naming

# approximate English glosses: recall-Han petition, media coverage, Kaohsiung status quo,
# party primary, campaign schedule, luxury-mansion scandal, epidemic prevention,
# international affairs, controversial remarks, livestream superfans
topic_name = c('罷韓聯署','媒體資訊','高雄現況','黨內初選','競選行程','豪宅風雲','防疫議題','國際局勢','令人詬病的言論','直播狂熱粉')

Document Topic Distributions

# for every document we have a probability distribution of its contained topics
tmResult <- posterior(fish_lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics
## [1] 23735    10

Every article has a distribution over topics, giving 23,735 articles × 10 topics in total.
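The same information is also available in tidy long format via tidytext, which can be handier for dplyr-style summaries; a sketch:

# gamma = per-document topic proportions, one row per document-topic pair
tidy(fish_lda, matrix = "gamma") %>%
  arrange(document, desc(gamma)) %>%
  head(10)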

cbind the Document Topic Distributions

View each article's topic composition

# get document topic proportions, in the same order as the original articles
document_topics <- doc_pro[fish_hatePolitics$artUrl, ]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
fish_topic = cbind(fish_hatePolitics, document_topics_df)

Inspect Articles for a Specific Topic

  • Sorting articles by their weight on a particular topic shows what the articles that load most heavily on that topic discuss.
fish_topic %>%
  arrange(desc(`直播狂熱粉`)) %>%
  head(10)

The top-ranked articles show that the "直播狂熱粉" (livestream superfans) topic mainly covers Ms. Lin (林小姐), a fervent Han Kuo-yu fan who livestreams to speak up for Han on all sorts of issues.
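To get an overview rather than inspecting one topic at a time, each post can be labeled with its dominant topic and the labels tabulated; a minimal sketch using base R's max.col:

# assign each document its highest-probability topic and count the labels
dominant <- topic_name[max.col(as.matrix(document_topics_df))]
sort(table(dominant), decreasing = TRUE)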

Topic Trends over Time

fish_topic[, c(11:20)] = sapply(fish_topic[, c(11:20)], as.numeric)  # columns 11:20 hold the ten topic proportions

fish_topic$artDate = fish_topic$artDate %>% as.Date("%Y/%m/%d")

# sum each topic's weight per month and plot as stacked bars
fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Because there is very little data for February–April 2020, we drop those three months.

Drop the Low-Volume Months

fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  filter(artDate < "202002") %>%   # drop 2020-02 onward (yyyymm strings compare correctly)
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  ggplot(aes(x = artDate, y = value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("value") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

This shows each month's overall volume, but it is hard to read off each topic's share within a month.

Topic Trends over Time as Proportions

fish_topic %>%
  group_by(artDate = format(artDate, "%Y%m")) %>%
  filter(artDate < "202002") %>%
  summarise_if(is.double, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(total_value = sum(value)) %>%  # normalize by the month's total
  ggplot(aes(x = artDate, y = value / total_value, fill = variable)) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values = mycolors) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Now we can see each month's topic proportions!