Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # For ubuntu
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# Sys.setlocale("LC_CTYPE", "cht") # For windows.
packages = c("readr","tm", "data.table", "dplyr", "stringr", "jiebaR", "tidytext", "ggplot2", "tidyr", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
## Loading required package: readr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(data.table)
## Loading required package: data.table
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(tidytext)
## Loading required package: tidytext
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(tidyr)
## Loading required package: tidyr
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
## Loading required package: wordcloud2
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
library(knitr)
資料描述: 1.關鍵字:全聯、家樂福、好事多 2.期間:2019/5/2~2020/4/29 3.來源:使用中山大家文字平台-PTT版Grossing 4.筆數:776
setwd("D:/OC Learn/NSYSU/Social Media Analysis/9th week/studyingcase")
merchandiser_meta <-read_csv("D:/OC Learn/NSYSU/Social Media Analysis/9th week/studyingcase/data/merchandiser_articleMetaData.csv")%>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
merchandiser_meta
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
merchandiser_sentences <- strsplit(merchandiser_meta$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
merchandiser_sentences <- data.frame(
artUrl = rep(merchandiser_meta$artUrl, sapply(merchandiser_sentences, length)),
sentence = unlist(merchandiser_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
merchandiser_sentences$sentence <- as.character(merchandiser_sentences$sentence)
merchandiser_sentences
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
# tokenize function
tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
tokens <- merchandiser_sentences %>%
unnest_tokens(word, sentence, token=tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens
merchandiser <- tokens %>%
mutate(artId = group_indices(., artUrl))
merchandiser
根據每一篇文章的Url給定一個id。
merchandiser_dtm <-merchandiser %>% cast_dtm(artId, word, count)
merchandiser_dtm
## <<DocumentTermMatrix (documents: 776, terms: 11080)>>
## Non-/sparse entries: 36387/8561693
## Sparsity : 100%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(merchandiser_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 37/63
## Sparsity : 63%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一同 一次 一雙 三重 下單 女款 丹鳳 介紹 內容 分配
## 1 1 1 1 1 1 1 1 1 1 1
## 10 0 0 0 0 0 0 0 0 0 0
## 2 0 0 1 0 0 0 0 1 0 0
## 3 0 1 0 0 2 0 0 1 1 1
## 4 0 0 0 0 2 0 0 1 1 1
## 5 0 1 0 0 1 0 0 1 1 1
## 6 0 1 0 0 1 0 0 1 1 1
## 7 0 0 0 0 0 0 0 1 0 0
## 8 0 1 0 0 4 0 0 1 1 1
## 9 0 0 0 0 0 0 0 0 0 0
merchandiser_lda <- LDA(merchandiser_dtm, k = 2, control = list(seed = 1234))
merchandiser_topics <- tidy(merchandiser_lda, matrix = "beta") # 使用"beta"來取出Phi矩陣。
merchandiser_topics
merchandiser_top_terms <- merchandiser_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
merchandiser_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
把一些常出現、跨主題共享的詞彙移除。
remove_words <- c("全聯","家樂福")
merchandiser_top_terms <- merchandiser_topics %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
merchandiser_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
結合topic1、topic2
beta_spread <- merchandiser_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
以log_ratio觀看前十名、後十名,比較二個topic差異
merchandiser_topic_ratio <- rbind(beta_spread %>% top_n(10,wt = log_ratio), beta_spread %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
merchandiser_topic_ratio
畫圖檢視
merchandiser_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()
移除所有出現在5篇文章以下的詞彙
reserved_word <- merchandiser %>%
group_by(word) %>%
count() %>%
filter(n > 5) %>%
unlist()
merchandiser_removed <- merchandiser %>%
filter( word %in% reserved_word)
將處理後的資料重新轉換成DTM
merchandiser_dtm_removed <- merchandiser_removed %>% cast_dtm(artId, word, count)
進行LDA
merchandiser_lda_removed <- LDA(merchandiser_dtm_removed, k = 2, control = list(seed = 1234))
merchandiser_topics_removed <- tidy(merchandiser_lda_removed, matrix = "beta")
merchandiser_topics_removed
merchandiser_topics_removed %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
merchandiser_topics_removed <- tidy(merchandiser_lda_removed, matrix = "beta")
beta_spread_removed <- merchandiser_topics_removed %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004) %>%
mutate(log_ratio = log2(topic2 / topic1))
merchandiser_removed_topic_ratio <- rbind(beta_spread_removed %>% top_n(10,wt = log_ratio), beta_spread_removed %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
merchandiser_removed_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
coord_flip()
merchandiser_documents_removed <- tidy(merchandiser_lda_removed, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
merchandiser_documents_removed
使用比率較高的topic作為代表topic,觀察不同Topic的本文
merchandiser_documents_removed$document<- merchandiser_documents_removed$document %>% as.integer()
merchandiser_documents_removed %>%
group_by(document) %>%
top_n(1,gamma) %>%
arrange(topic) %>%
inner_join(merchandiser %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
inner_join(merchandiser_meta, by="artUrl") %>%
select(topic, sentence)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
## Adding missing grouping variables: `document`
merchandiser_documents_removed$document <- merchandiser_documents_removed$document %>% as.integer()
merchandiser_documents_removed %>%
group_by(topic) %>%
top_n(10, wt=gamma) %>%
inner_join(merchandiser, by = c("document" = "artId")) %>%
distinct(artUrl) %>%
inner_join(merchandiser_meta, by = "artUrl") %>%
select(topic, artTitle)
## Warning: Column `artUrl` joining factor and character vector, coercing into
## character vector
merchandiser_removed %>%
group_by(word) %>%
filter(! word %in% remove_words) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum)) %>%
wordcloud2()
從所有文章中取出一半「topic 1含量最高」的文章。
merchandiser_documents_removed_spread <- merchandiser_documents_removed %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, gamma)
half_num = round(nrow(merchandiser_removed %>% distinct(artUrl))/2) # 原始文章數量的一半
topic1_id <- merchandiser_documents_removed_spread %>% # 取出topic_1最高的half_num篇文章
top_n(half_num, topic1) %>%
select(document) %>%
unlist()
topic2_id <- merchandiser_documents_removed_spread$document %>%
setdiff(topic1_id)