System parameter settings
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters

## [1] ""
Install the required packages
packages = c("readr", "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "data.table", "NLP", "ggraph", "igraph", "reshape2", "widyr", "magrittr", "topicmodels", "yaml", "plotly", "tm", "purrr", "LDAvis", "slam", "RColorBrewer", "text2vec", "udpipe")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load packages
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(data.table)
require(readr)
require(NLP)
require(ggraph)
require(igraph)
require(reshape2)
require(widyr)
require(plotly)
require(magrittr)
require(topicmodels)
library(tm)
library(purrr)
library(LDAvis)
library(slam)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

Load the raw data
MetaData = fread("articleMetaData.csv", encoding = 'UTF-8') # article contents
Reviews = fread("articleReviews.csv", encoding = 'UTF-8') # comments

Build a separate data.frame for the 7-11 data
#7-11
keywords_7 = c('7-11','小7','711')
toMatch_7 = paste(keywords_7,collapse="|")
MetaData_7 = with(MetaData,MetaData[grepl(toMatch_7,sentence)|grepl(toMatch_7,artTitle),]) # could be filtered further
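The same filter can also be written with stringr's str_detect, which is already loaded; a sketch of an equivalent alternative using only objects defined above, not the original approach:

MetaData_7 <- MetaData %>%
  filter(str_detect(sentence, toMatch_7) | str_detect(artTitle, toMatch_7))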
Reviews_7 = left_join(MetaData_7, Reviews[,c("artUrl", "cmtContent")], by = "artUrl") # comments

MetaData_7 <- MetaData_7 %>%
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註請放最後面|備註|違者新聞文章刪除|【商品名稱/價格】|【活動名稱】|【便利商店/廠商名稱】|【規格/內容物/熱量】|【評分】|【活動時間】", "", sentence)) # strip boilerplate markers from post bodies
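Two pitfalls in the pattern above: regex alternation tries branches left to right, so the longer "備註請放最後面" must precede "備註", and strings such as "(或短網址)" contain regex metacharacters, so the parentheses act as a group rather than literal characters. A sketch of a literal-matching alternative using stringr::fixed(), with a hypothetical boilerplate vector:

boilerplate <- c("媒體來源", "記者署名", "(或短網址)") # hypothetical subset; extend as needed
for (b in boilerplate) {
  MetaData_7$sentence <- str_remove_all(MetaData_7$sentence, fixed(b))
}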
# # Sentence segmentation (replace "\n\n" with "。")
# meta <- MetaData_7 %>%
# mutate(sentence=gsub("[\n]{2,}", "。", sentence))
#
# # Split sentences on full-width or half-width exclamation marks, question marks, and semicolons, plus the full-width period
# sentences_7 <- strsplit(meta$sentence,"[。!;?!?;]")
#
# # Pair each sentence with the URL of its source article and collect into a data.frame
# sentences_7 <- data.frame(
# artUrl = rep(meta$artUrl, sapply(sentences_7, length)),
# sentence = unlist(sentences_7)
# ) %>%
# filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# # Drop rows that contain only \t, \n, or spaces
#
# sentences_7$sentence <- as.character(sentences_7$sentence)
# sentences_7

Set up the tokenization engine
# # Tokenize the articles
# # Load lexicon_7 (custom terms that should be kept as single tokens, i.e. a user dictionary)
# lexicon_7 <- scan(file = "/Users/user/Documents/R/midtermproject/final711/lexicon_7.txt",
# what=character(),sep='\n',
# encoding='utf-8',fileEncoding='utf-8')
# #load stop words
# stopwords_7 <- scan(file = "/Users/user/Documents/R/midtermproject/final711/stopwords_7.txt",
# what=character(),sep='\n',
# encoding='utf-8',fileEncoding='utf-8')
#
# # Initialize a tokenization engine with default parameters
# jieba_tokenizer = worker()
#
# # Register the custom dictionary with the tokenizer
# new_user_word(jieba_tokenizer, c(lexicon_7))
#
# #tokenize function
# tokenizer_7 <- function(t) {
# lapply(t, function(x) {
# if(nchar(x)>1){
# tokens <- segment(x, jieba_tokenizer)
# tokens <- tokens[!tokens %in% stopwords_7]
# # Drop tokens with only one character
# tokens <- tokens[nchar(tokens)>1]
# return(tokens)
# }
# })
# }
#
#
# ## Tokenize the sentences with the engine initialized above
# tokens <- sentences_7 %>%
# mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
# mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
# unnest_tokens(word, sentence, token=tokenizer_7) %>%
# count(artUrl, word) %>% # count word frequencies per article
# rename(count=n)
#
# # tokens
# save.image(file = "/Users/user/Documents/R/midtermproject/final711/token_result.rdata")

The tokenization results can be saved so they do not have to be recomputed on every run.
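A lighter-weight alternative to save.image() is to persist only the tokens object; a minimal sketch, with a hypothetical .rds path:

# saveRDS(tokens, "token_result.rds")   # save just the tokenization result
# tokens <- readRDS("token_result.rds") # reload it in a later session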
load("/Users/user/Documents/R/midtermproject/final711/token_result.rdata")freq = 3
# Keep words above a frequency threshold
reserved_word <- tokens %>%
  group_by(word) %>%
  count() %>%
  filter(n > freq) %>%
  pull(word) # extract just the word column as a character vector
removed_7 <- tokens %>%
filter(word %in% reserved_word)
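To see how much the frequency threshold shrinks the vocabulary, a quick check with dplyr's n_distinct (a sketch, not part of the original analysis):

n_distinct(tokens$word)    # vocabulary size before filtering
n_distinct(removed_7$word) # vocabulary size after the frequency filter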
# In dtm_7, nrow = number of documents and ncol = number of terms
dtm_7 <- removed_7 %>% cast_dtm(artUrl, word, count)

# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm_7, k = topic, control = list(seed = 2021))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result.rdata") # save the fitted models to a file
# }

Load the saved LDA models
load("ldas_result.rdata")topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
Perplexity is lower when the number of topics is greater than four and less than eight.
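To choose k programmatically instead of reading it off the plot, one can take the candidate with the lowest perplexity; a minimal sketch reusing the topics and ldas objects from above:

perplexities <- map_dbl(ldas, topicmodels::perplexity)
best_k <- topics[which.min(perplexities)]
best_k # candidate topic number with the lowest perplexity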
library(text2vec)

##
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
##
## perplexity
## The following object is masked from 'package:igraph':
##
## normalize
library(udpipe)
dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)

## [1] 4315 1321
set.seed(2019)
topic_n = 4
lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)

## INFO [12:28:26.582] early stopping at 240 iteration
## INFO [12:28:27.504] early stopping at 40 iteration
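fit_transform() returns a document-by-topic matrix, so each document's dominant topic can be read off directly; a base-R sketch, not part of the original analysis:

# For every document, find the column (topic) with the highest weight:
dominant_topic <- apply(doc_topic_distr, 1, which.max)
table(dominant_topic) # number of documents assigned to each topic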
lda_model$get_top_words(n = 10, lambda = 0.5) # view the top 10 words per topic

##       [,1]   [,2]   [,3]       [,4]
## [1,] "心得" "加熱" "活動" "試用品"
## [2,] "覺得" "雞肉" "店員" "試吃"
## [3,] "起來" "搭配" "請問" "可跳過"
## [4,] "口味" "實際" "謝謝" "刪除"
## [5,] "就是" "醬汁" "門市" "如有"
## [6,] "感覺" "調味" "注意事項" "項目"
## [7,] "好吃" "這款" "大家" "清楚"
## [8,] "不會" "結論" "名稱" "滿分"
## [9,] "味道" "產品" "知道" "推薦"
## [10,] "看到" "微波" "廠商" "標註"
# lda_model$plot()
# lda_model$plot(out.dir = "lda_result", open.browser = TRUE)

The LDAvis view shows four topics that are essentially non-overlapping; only the larger central topic slightly covers the other three.
Feed the DTM prepared above into the LDA function for analysis
# Fit LDA with 3 topics
lda_7 <- LDA(dtm, k = 3, control = list(seed = 123))

Find each topic's representative words
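For a quick first look before filtering by beta values, topicmodels also provides terms(); a sketch, output not shown here:

terms(lda_7, 10) # top 10 terms for each of the three topics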
removed_word = c("不是","每天","出來","覺得", "這次", "還是", "刪除", "心得")
# Look at frequent words in each topic
tidy(lda_7, matrix = "beta") %>% # extract the topic-term beta values
filter(! term %in% removed_word) %>%
group_by(topic) %>%
top_n(10, beta) %>% # top 10 words by beta value
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()

Extract each article's representative topic: assign every article the topic with the largest gamma value
# Use matrix = "gamma" in tidy() to retrieve the theta (document-topic) matrix
topics_7 <- tidy(lda_7, matrix="gamma") %>% # document topic gamma
group_by(document) %>%
top_n(1, wt=gamma)
topics_7

## # A tibble: 4,315 x 3
## # Groups: document [4,315]
## document topic gamma
## <chr> <int> <dbl>
## 1 https://www.ptt.cc/bbs/CVS/M.1451750974.A.F67.html 1 0.335
## 2 https://www.ptt.cc/bbs/CVS/M.1452080610.A.F12.html 1 0.352
## 3 https://www.ptt.cc/bbs/CVS/M.1452238381.A.25C.html 1 0.339
## 4 https://www.ptt.cc/bbs/CVS/M.1452506733.A.F79.html 1 0.342
## 5 https://www.ptt.cc/bbs/CVS/M.1452692377.A.1BF.html 1 0.335
## 6 https://www.ptt.cc/bbs/CVS/M.1452700931.A.C54.html 1 0.362
## 7 https://www.ptt.cc/bbs/CVS/M.1452834637.A.308.html 1 0.342
## 8 https://www.ptt.cc/bbs/CVS/M.1453045186.A.162.html 1 0.370
## 9 https://www.ptt.cc/bbs/CVS/M.1453181617.A.62F.html 1 0.368
## 10 https://www.ptt.cc/bbs/CVS/M.1453188773.A.64C.html 1 0.345
## # ... with 4,305 more rows
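Note that top_n(1, wt = gamma) keeps ties, so a document can occasionally be assigned to more than one topic. A sketch that forces exactly one topic per document using dplyr's slice_max:

topics_7 <- tidy(lda_7, matrix = "gamma") %>%
  group_by(document) %>%
  slice_max(order_by = gamma, n = 1, with_ties = FALSE) %>%
  ungroup()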
posts_topic <- merge(x = MetaData_7, y = topics_7, by.x = "artUrl", by.y="document")
# Check what each topic is about
set.seed(123)
posts_topic %>% # topic 1
filter(topic==1) %>%
select(artTitle) %>%
unique() %>%
sample_n(10)

## artTitle
## 1: [商品]7-11義式番茄蔬菜湯
## 2: [商品]7-11-福樂鮮攪絲滑可可
## 3: [商品]全家芝麻霜淇淋
## 4: Re:[問題]從7-11離職想再找7-11工作但很難找
## 5: [商品]7-11叉燒豚骨拉麵新上市
## 6: [商品]7-11冰森永牛奶糖奶茶
## 7: Re:[商品]全家日式和醋涼麵
## 8: [問題]7-11交貨便大當機?
## 9: [商品]7-11卡辣姆久厚切洋芋片-極辛
## 10: [商品]7-11老虎堂黑糖波霸厚雪糕
posts_topic %>% # topic 2
filter(topic==2) %>%
select(artTitle) %>%
unique() %>%
sample_n(10)

## artTitle
## 1: [商品]7-11咖哩烤雞焗飯
## 2: [討論]20187-11進口啤酒
## 3: [商品]7-11鯖魚紅藜御膳便當
## 4: [感想]高雄第一間無人小七7-11星海灣門市
## 5: [商品]7-11歐姆蛋包飯
## 6: [商品]7-11御飯糰炙燒雪花牛
## 7: [問題]7-11石安牧場冷藏蛋售價
## 8: [商品]七七乳加-薄荷(全家)、迦納可可和紐西蘭-
## 9: [商品]7-11美珍香豬肉乾
## 10: [問題]7-11剛結束的KITTYxline點數一問
posts_topic %>% # topic 3
filter(topic==3) %>%
select(artTitle) %>%
unique() %>%
sample_n(10)

## artTitle
## 1: [商品]7-11草莓黑雷神vsX-5草莓巧克力餅乾
## 2: [商品]7-11紫黑米紅豆牛奶
## 3: [商品]7-11北海道帆立貝飯糰蜜蜂工坊蜂蜜牛奶
## 4: [討論]7-11福袋頭獎汽車沒人領?
## 5: [商品]7-11佛蒙特咖哩飯新上市
## 6: [問題]請問7-11多彩便當台北市分布的地區?
## 7: [問題]7-11的Y拍運費支付
## 8: [商品]7-11樂事鬍鬚張聯名洋芋片
## 9: [問題]7-11現金抵用券購買預購商品
## 10: [問題]請問7-11悠遊卡可以加值零錢嗎?
From these samples we can summarize the topics as:
topic 1 = "new 7-11 products and product issues"
topic 2 = "product prices and loyalty points"
topic 3 = "7-11 services, lucky bags, and brand collaborations"
Look at the number of posts per topic
posts_topic %>%
group_by(artCat,topic) %>%
summarise(sum = n()) %>%
ggplot(aes(x= artCat,y=sum,fill=as.factor(topic))) +
geom_col(position="dodge")

## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.
All three topics draw substantial discussion, with topic 3, "7-11 services, lucky bags, and brand collaborations", attracting the most.