System settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
## [1] ""

Install the required packages

packages = c("readr","dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "data.table","NLP","ggraph","igraph","reshape2", "widyr", "magrittr", "topicmodels", "yaml")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Load the packages

require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(data.table)
require(readr)
require(NLP)
require(ggraph)
require(igraph)
require(reshape2)
require(widyr)
require(plotly)
require(magrittr)
require(topicmodels)
library(tm)
library(purrr)
library(LDAvis)
library(slam)
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)

Load the raw data

MetaData = fread("articleMetaData.csv", encoding = 'UTF-8') # article contents
Reviews  = fread("articleReviews.csv", encoding = 'UTF-8')  # comments

Build a separate data.frame for the 7-11-related posts

#7-11
keywords_7 = c('7-11','小7','711')
toMatch_7 = paste(keywords_7,collapse="|")

MetaData_7 = MetaData[grepl(toMatch_7, sentence) | grepl(toMatch_7, artTitle), ] # could be filtered further
Reviews_7 = left_join(MetaData_7, Reviews[, c("artUrl", "cmtContent")], by = "artUrl") # attach the comments
MetaData_7 <- MetaData_7 %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|【商品名稱/價格】|【活動名稱】|【便利商店/廠商名稱】|【規格/內容物/熱量】|【評分】|【活動時間】", "", sentence)) # strip post-template boilerplate

Data cleaning

1. Sentence splitting and word segmentation

# # Split articles into sentences (replace "\n\n" with "。")
# meta <- MetaData_7 %>%
#             mutate(sentence=gsub("[\n]{2,}", "。", sentence))
# 
# # split on full-width or half-width exclamation marks, question marks, semicolons,
# # and the full-width period
# sentences_7 <- strsplit(meta$sentence,"[。!;?!?;]")
# 
# # pair each sentence with the URL of the article it belongs to, collected into a data frame
# sentences_7 <- data.frame(
#                        artUrl = rep(meta$artUrl, sapply(sentences_7, length)),
#                          sentence = unlist(sentences_7)
#                        ) %>%
#                        filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
#                        # drop sentences consisting only of \t, \n, or spaces
#  
# sentences_7$sentence <- as.character(sentences_7$sentence)
# sentences_7

Set up the word segmentation engine

# # Tokenize the articles
# # load lexicon_7 (domain-specific terms to keep intact, used as the user dictionary)
# lexicon_7 <- scan(file = "/Users/user/Documents/R/midtermproject/final711/lexicon_7.txt",
#                     what=character(),sep='\n',
#                     encoding='utf-8',fileEncoding='utf-8')
# # load stop words
# stopwords_7 <- scan(file = "/Users/user/Documents/R/midtermproject/final711/stopwords_7.txt",
#                     what=character(),sep='\n',
#                     encoding='utf-8',fileEncoding='utf-8')
#  
# # initialize a segmentation engine with the default parameters
# jieba_tokenizer = worker()
# 
# # add the custom dictionary to the engine
# new_user_word(jieba_tokenizer, c(lexicon_7))
# 
# # tokenize function
# tokenizer_7 <- function(t) {
#     lapply(t, function(x) {
#     if(nchar(x)>1){
#     tokens <- segment(x, jieba_tokenizer)
#         tokens <- tokens[!tokens %in% stopwords_7] # remove stop words
#       # drop tokens whose length is 1
#         tokens <- tokens[nchar(tokens)>1]
#         return(tokens)
#       }
#     })
#   }
# 
# 
# ## segment the sentences with the engine initialized above
# tokens <- sentences_7 %>%
#       mutate(sentence = gsub("[[:punct:]]", "",sentence)) %>%
#       mutate(sentence = gsub("[0-9a-zA-Z]", "",sentence)) %>%
#       unnest_tokens(word, sentence, token=tokenizer_7) %>%
#      count(artUrl, word) %>% # word frequency per article
#      rename(count=n)
# 
# # tokens
# save.image(file = "/Users/user/Documents/R/midtermproject/final711/token_result.rdata")

The segmentation results can be saved so they do not need to be recomputed every time.

load("/Users/user/Documents/R/midtermproject/final711/token_result.rdata")

2. Clean the segmentation results

freq = 3
# keep only words that appear more than `freq` times in the corpus
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  pull(word)

removed_7 <- tokens %>% 
  filter(word %in% reserved_word)

# in the dtm: nrow = number of articles; ncol = number of distinct words
dtm_7 <- removed_7 %>% cast_dtm(artUrl, word, count) 
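As a quick sanity check (a sketch, assuming the objects above are still in the workspace), the dimensions of dtm_7 confirm the layout described in the comment:

dim(dtm_7) # rows = number of articles, columns = number of retained words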

Build the LDA analysis

1. Decide the number of topics

# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm_7, k = topic, control = list(seed = 2021))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result.rdata") # save the fitted models to a file
# }

Load the previously saved LDA models

load("ldas_result.rdata")

2. Find the optimal number of topics via perplexity

topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

The plot shows that perplexity is lower for topic numbers greater than four and less than eight.
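To back up this reading, the raw perplexity values can also be printed as a small table (a sketch reusing the ldas and topics objects from above):

# tabulate the perplexity of each fitted model; the smallest value is the best
tibble(k = topics, perplexity = map_dbl(ldas, topicmodels::perplexity)) %>%
  arrange(perplexity)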

3. Plot the model with LDAvis

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
## The following object is masked from 'package:igraph':
## 
##     normalize
library(udpipe)

dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 4315 1321
set.seed(2019)

topic_n = 4

lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO  [12:28:26.582] early stopping at 240 iteration 
## INFO  [12:28:27.504] early stopping at 40 iteration
lda_model$get_top_words(n = 10, lambda = 0.5) ## top 10 words for each topic
##       [,1]   [,2]   [,3]       [,4]    
##  [1,] "心得" "加熱" "活動"     "試用品"
##  [2,] "覺得" "雞肉" "店員"     "試吃"  
##  [3,] "起來" "搭配" "請問"     "可跳過"
##  [4,] "口味" "實際" "謝謝"     "刪除"  
##  [5,] "就是" "醬汁" "門市"     "如有"  
##  [6,] "感覺" "調味" "注意事項" "項目"  
##  [7,] "好吃" "這款" "大家"     "清楚"  
##  [8,] "不會" "結論" "名稱"     "滿分"  
##  [9,] "味道" "產品" "知道"     "推薦"  
## [10,] "看到" "微波" "廠商"     "標註"
# lda_model$plot()
# lda_model$plot(out.dir ="lda_result", open.browser = TRUE)

The LDAvis view shows that, apart from one larger topic in the middle that slightly overlaps the other three, the four topics do not overlap one another.

4. Topic classification

Feed the document-term matrix prepared above into the LDA function for analysis

# fit LDA with 3 topics
lda_7 <- LDA(dtm, k = 3, control = list(seed = 123))

Find the representative words for each topic

removed_word = c("不是","每天","出來","覺得", "這次", "還是", "刪除", "心得")

# frequently used words in each topic
tidy(lda_7, matrix = "beta") %>% # extract the topic-term beta values
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>% # top 10 words by beta in each topic
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Assign each article a representative topic: the topic with the largest gamma value for that article.

# use matrix="gamma" in the tidy function to obtain the theta (document-topic) matrix
topics_7 <- tidy(lda_7, matrix="gamma") %>% # document-topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
topics_7
## # A tibble: 4,315 x 3
## # Groups:   document [4,315]
##    document                                           topic gamma
##    <chr>                                              <int> <dbl>
##  1 https://www.ptt.cc/bbs/CVS/M.1451750974.A.F67.html     1 0.335
##  2 https://www.ptt.cc/bbs/CVS/M.1452080610.A.F12.html     1 0.352
##  3 https://www.ptt.cc/bbs/CVS/M.1452238381.A.25C.html     1 0.339
##  4 https://www.ptt.cc/bbs/CVS/M.1452506733.A.F79.html     1 0.342
##  5 https://www.ptt.cc/bbs/CVS/M.1452692377.A.1BF.html     1 0.335
##  6 https://www.ptt.cc/bbs/CVS/M.1452700931.A.C54.html     1 0.362
##  7 https://www.ptt.cc/bbs/CVS/M.1452834637.A.308.html     1 0.342
##  8 https://www.ptt.cc/bbs/CVS/M.1453045186.A.162.html     1 0.370
##  9 https://www.ptt.cc/bbs/CVS/M.1453181617.A.62F.html     1 0.368
## 10 https://www.ptt.cc/bbs/CVS/M.1453188773.A.64C.html     1 0.345
## # ... with 4,305 more rows

Exploring the article content

posts_topic <- merge(x = MetaData_7, y = topics_7, by.x = "artUrl", by.y="document")

# take a look at what each topic is about
set.seed(123)
posts_topic %>% # topic 1
  filter(topic==1) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(10)
##                                      artTitle
##  1:                  [商品]7-11義式番茄蔬菜湯
##  2:               [商品]7-11-福樂鮮攪絲滑可可
##  3:                      [商品]全家芝麻霜淇淋
##  4: Re:[問題]從7-11離職想再找7-11工作但很難找
##  5:              [商品]7-11叉燒豚骨拉麵新上市
##  6:                [商品]7-11冰森永牛奶糖奶茶
##  7:                 Re:[商品]全家日式和醋涼麵
##  8:                  [問題]7-11交貨便大當機?
##  9:         [商品]7-11卡辣姆久厚切洋芋片-極辛
## 10:            [商品]7-11老虎堂黑糖波霸厚雪糕
posts_topic %>% # topic 2
  filter(topic==2) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(10)
##                                         artTitle
##  1:                       [商品]7-11咖哩烤雞焗飯
##  2:                       [討論]20187-11進口啤酒
##  3:                   [商品]7-11鯖魚紅藜御膳便當
##  4:       [感想]高雄第一間無人小七7-11星海灣門市
##  5:                         [商品]7-11歐姆蛋包飯
##  6:                   [商品]7-11御飯糰炙燒雪花牛
##  7:                 [問題]7-11石安牧場冷藏蛋售價
##  8: [商品]七七乳加-薄荷(全家)、迦納可可和紐西蘭-
##  9:                       [商品]7-11美珍香豬肉乾
## 10:         [問題]7-11剛結束的KITTYxline點數一問
posts_topic %>% # topic 3
  filter(topic==3) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(10)
##                                       artTitle
##  1:    [商品]7-11草莓黑雷神vsX-5草莓巧克力餅乾
##  2:                   [商品]7-11紫黑米紅豆牛奶
##  3: [商品]7-11北海道帆立貝飯糰蜜蜂工坊蜂蜜牛奶
##  4:             [討論]7-11福袋頭獎汽車沒人領?
##  5:               [商品]7-11佛蒙特咖哩飯新上市
##  6:    [問題]請問7-11多彩便當台北市分布的地區?
##  7:                    [問題]7-11的Y拍運費支付
##  8:             [商品]7-11樂事鬍鬚張聯名洋芋片
##  9:           [問題]7-11現金抵用券購買預購商品
## 10:        [問題]請問7-11悠遊卡可以加值零錢嗎?

From these samples we can summarize:
topic 1 = "newly released 7-11 products and related questions"
topic 2 = "product prices and loyalty points"
topic 3 = "7-11 services, lucky bags, and brand collaborations"
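To carry these labels through the rest of the analysis, they can be attached to posts_topic (a sketch; the topic_labels vector and topic_label column are our own naming):

# attach the human-readable labels to each article's assigned topic (sketch)
topic_labels <- c("1" = "newly released 7-11 products and related questions",
                  "2" = "product prices and loyalty points",
                  "3" = "7-11 services, lucky bags, and brand collaborations")
posts_topic <- posts_topic %>%
  mutate(topic_label = unname(topic_labels[as.character(topic)]))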

Number of posts per topic

posts_topic %>%
  group_by(artCat,topic) %>%
  summarise(sum = n())  %>%
  ggplot(aes(x= artCat,y=sum,fill=as.factor(topic))) +
  geom_col(position="dodge") 
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.

All three topics attract a lot of discussion, with topic 3, "7-11 services, lucky bags, and brand collaborations", being the most discussed.
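The exact counts behind the bar chart can be checked directly (a minimal sketch using posts_topic as defined above):

# number of articles assigned to each topic
posts_topic %>%
  count(topic, sort = TRUE)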