系統設置

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # For ubuntu
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# Sys.setlocale("LC_CTYPE", "cht") # For windows.

安裝需要的packages

packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

載入packages

require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
library(knitr)

讀取資料

資料描述: 1.關鍵字:全聯、家樂福、好事多 2.期間:2019/5/2~2020/4/29 3.來源:使用中山大家文字平台-PTT版Grossing 4.筆數:776

載入資料集

setwd("D:/OC Learn/NSYSU/Social Media Analysis/9th week/studyingcase")
merchandiser_meta <-read_csv("D:/OC Learn/NSYSU/Social Media Analysis/9th week/studyingcase/data/merchandiser_articleMetaData.csv")%>%
              mutate(sentence=gsub("[\n]{2,}", "。", sentence))
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
merchandiser_meta

Sentences Segmentation

# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
merchandiser_sentences <- strsplit(merchandiser_meta$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
merchandiser_sentences <- data.frame(
                        artUrl = rep(merchandiser_meta$artUrl, sapply(merchandiser_sentences, length)), 
                        sentence = unlist(merchandiser_sentences)
                      ) %>%
                      filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
merchandiser_sentences$sentence <- as.character(merchandiser_sentences$sentence)
merchandiser_sentences

Tokenization

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()

# tokenize function
tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
tokens <- merchandiser_sentences %>%
  unnest_tokens(word, sentence, token=tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word) %>%
  rename(count=n)
tokens

加入識別文章的id

merchandiser <- tokens %>%
  mutate(artId = group_indices(., artUrl))
merchandiser

根據每一篇文章的Url給定一個id。

將資料轉換為Document Term Matrix (DTM)

merchandiser_dtm <-merchandiser %>% cast_dtm(artId, word, count)
merchandiser_dtm
## <<DocumentTermMatrix (documents: 776, terms: 11080)>>
## Non-/sparse entries: 36387/8561693
## Sparsity           : 100%
## Maximal term length: 7
## Weighting          : term frequency (tf)
inspect(merchandiser_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 37/63
## Sparsity           : 63%
## Maximal term length: 2
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 一同 一次 一雙 三重 下單 女款 丹鳳 介紹 內容 分配
##   1     1    1    1    1    1    1    1    1    1    1
##   10    0    0    0    0    0    0    0    0    0    0
##   2     0    0    1    0    0    0    0    1    0    0
##   3     0    1    0    0    2    0    0    1    1    1
##   4     0    0    0    0    2    0    0    1    1    1
##   5     0    1    0    0    1    0    0    1    1    1
##   6     0    1    0    0    1    0    0    1    1    1
##   7     0    0    0    0    0    0    0    1    0    0
##   8     0    1    0    0    4    0    0    1    1    1
##   9     0    0    0    0    0    0    0    0    0    0

建立LDA模型

merchandiser_lda <- LDA(merchandiser_dtm, k = 2, control = list(seed = 1234))

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

merchandiser_topics <- tidy(merchandiser_lda, matrix = "beta") # 使用"beta"來取出Phi矩陣。
merchandiser_topics

尋找Topic的代表字

merchandiser_top_terms <- merchandiser_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


merchandiser_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

把一些常出現、跨主題共享的詞彙移除。

remove_words <- c("全聯","家樂福")
merchandiser_top_terms <- merchandiser_topics %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


merchandiser_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

查看組別間差異最大的詞

結合topic1、topic2

beta_spread <- merchandiser_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .0004 | topic2 > .0004 ) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread

以log_ratio觀看前十名、後十名,比較二個topic差異

merchandiser_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio), beta_spread %>% top_n(-10, log_ratio)) %>%
  arrange(log_ratio)
merchandiser_topic_ratio

畫圖檢視

merchandiser_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  xlab("Word")+
  coord_flip()

移除所有出現在5篇文章以下的詞彙

reserved_word <- merchandiser %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 5) %>% 
  unlist()

merchandiser_removed <- merchandiser %>% 
  filter( word %in% reserved_word)

將處理後的資料重新轉換成DTM

merchandiser_dtm_removed <- merchandiser_removed %>% cast_dtm(artId, word, count)

進行LDA

merchandiser_lda_removed <- LDA(merchandiser_dtm_removed, k = 2, control = list(seed = 1234))
merchandiser_topics_removed <- tidy(merchandiser_lda_removed, matrix = "beta")
merchandiser_topics_removed
merchandiser_topics_removed %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

merchandiser_topics_removed <- tidy(merchandiser_lda_removed, matrix = "beta")

beta_spread_removed <- merchandiser_topics_removed %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .0004 | topic2 > .0004) %>%
  mutate(log_ratio = log2(topic2 / topic1))

merchandiser_removed_topic_ratio <- rbind(beta_spread_removed %>% top_n(10,wt = log_ratio), beta_spread_removed %>% top_n(-10, log_ratio)) %>% 
  arrange(log_ratio)

merchandiser_removed_topic_ratio %>% 
  ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
  geom_bar(stat="identity") + 
  coord_flip()

\(\theta\) matrix (document * topic)

查看\(\theta\) matrix

merchandiser_documents_removed <- tidy(merchandiser_lda_removed, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
merchandiser_documents_removed

簡單的主題分類

使用比率較高的topic作為代表topic,觀察不同Topic的本文

merchandiser_documents_removed$document<- merchandiser_documents_removed$document %>% as.integer()
merchandiser_documents_removed %>% 
  group_by(document) %>% 
  top_n(1,gamma) %>% 
  arrange(topic) %>% 
  inner_join(merchandiser %>% distinct(artUrl,artId), by=c("document" = "artId")) %>% 
  inner_join(merchandiser_meta, by="artUrl") %>% 
  select(topic, sentence)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
## Adding missing grouping variables: `document`
merchandiser_documents_removed$document <- merchandiser_documents_removed$document %>% as.integer()
merchandiser_documents_removed %>% 
  group_by(topic) %>% 
  top_n(10, wt=gamma) %>% 
  inner_join(merchandiser, by = c("document" = "artId")) %>% 
  distinct(artUrl) %>%
  inner_join(merchandiser_meta, by = "artUrl") %>% 
  select(topic, artTitle)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
merchandiser_removed %>% 
  group_by(word) %>% 
  filter(! word %in% remove_words) %>% 
  summarise(sum = sum(count)) %>% 
  arrange(desc(sum)) %>% 
  wordcloud2()

按照\(\theta\)矩陣的“gamma”值劃分兩種主題的資料

從所有文章中取出一半「topic 1含量最高」的文章。

merchandiser_documents_removed_spread <- merchandiser_documents_removed %>% 
  mutate(topic = paste0("topic", topic)) %>% 
  spread(topic, gamma)

half_num = round(nrow(merchandiser_removed %>% distinct(artUrl))/2) # 原始文章數量的一半

topic1_id <- merchandiser_documents_removed_spread %>%   # 取出topic_1最高的half_num篇文章
  top_n(half_num, topic1) %>%  
  select(document) %>% 
  unlist()

topic2_id <- merchandiser_documents_removed_spread$document %>% 
  setdiff(topic1_id)