require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(tidytext)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(topicmodels)
library(readr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(widyr)
library(ggraph)

將網路爬到的booking.com針對澎湖住宿的評價匯入

setwd("C:/learning/mid")
booking<- fread("booking_reviews.csv")

分出民宿及飯店評價滿分的飯店

bhs<-(booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking$HotelName),]) 
bhs<-(bhs[grepl("10", bhs$Rate),])
bht<-(booking[grepl("店|館|中心|文旅", booking$HotelName),]) 
bht<-(bht[grepl("10", bht$Rate),])

自訂user word及停用字並用結巴斷詞

jieba_tokenizer <- worker(stop_word ="stop_words.txt",user="user_words.txt")

book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
bhs$Review=as.character(bhs$Review)
tidybookbhs = bhs %>% unnest_tokens(word,Review,token= book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName))  %>%  select(HotelName,word,Id)

bht$Review=as.character(bht$Review)
tidybookbht = bht %>% unnest_tokens(word,Review,token= book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName))  %>%  select(HotelName,word,Id)

針對全住宿評價做斷詞

tidybook = booking %>% 
  unnest_tokens(word,Review,token= book_tokenizer) %>%
  count(HotelName,word) %>% 
  filter(word!='好極了') %>% 
  filter(word!='傑出') %>% 
  filter(word!='很棒') %>% 
    rename(count=n) %>% 
  mutate(Id = group_indices(., HotelName)) 

將資料轉換為Document Term Matrix (DTM)

token= tidybook
tidybook_dtm <- token %>% cast_dtm(Id, word, count)
tidybook_dtm
## <<DocumentTermMatrix (documents: 227, terms: 3461)>>
## Non-/sparse entries: 11894/773753
## Sparsity           : 98%
## Maximal term length: 16
## Weighting          : term frequency (tf)

建立LDA模型 分兩群#查看ϕϕ matrix (topic * term), 從book_topics中可以得到特定主題生成特定詞彙的概率。

尋找Topic的代表字

book_lda <- LDA(tidybook_dtm, k = 2, control = list(seed = 1234))

查看ϕϕ matrix (topic * term), 從book_topics中可以得到特定主題生成特定詞彙的概率。

book_topics <- tidy(book_lda, matrix = "beta")
# 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
book_topics
## # A tibble: 6,922 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 80    0.000257
##  2     2 80    0.000117
##  3     1 nice  0.00483 
##  4     2 nice  0.00288 
##  5     1 小孩  0.000206
##  6     2 小孩  0.000751
##  7     1 不太  0.000257
##  8     2 不太  0.000117
##  9     1 不錯  0.00863 
## 10     2 不錯  0.00962 
## # ... with 6,912 more rows

尋找Topic的代表字

book_top_terms <- book_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


book_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

book_top_terms
## # A tibble: 20 x 3
##    topic term    beta
##    <int> <chr>  <dbl>
##  1     1 令人  0.0257
##  2     1 民宿  0.0234
##  3     1 愉悅  0.0205
##  4     1 老闆  0.0189
##  5     1 舒適  0.0182
##  6     1 親切  0.0162
##  7     1 服務  0.0151
##  8     1 房間  0.0147
##  9     1 住宿  0.0138
## 10     1 乾淨  0.0134
## 11     2 令人  0.0257
## 12     2 民宿  0.0224
## 13     2 愉悅  0.0211
## 14     2 老闆  0.0166
## 15     2 房間  0.0160
## 16     2 尚可  0.0146
## 17     2 舒適  0.0134
## 18     2 乾淨  0.0129
## 19     2 服務  0.0126
## 20     2 住宿  0.0112

計算評價滿分民宿的評語之間的Co-occurrence:

node_name=fread(file = "c:/learning/mid/word.txt", encoding='UTF-8',header=F)
nnode_name=fread(file = "c:/learning/mid/word1.txt", encoding='UTF-8',header=F)
term_cooccurrence_m=tidybookbhs %>%
  filter(word  %in% node_name$V1) %>%   
  pairwise_count(word, Id, sort = TRUE,diag=F)



term_cooccurrence_m=as.data.frame(term_cooccurrence_m)

移除重複的pairwise

for (i in 1:nrow(term_cooccurrence_m)){
    term_cooccurrence_m[i, ] = sort(term_cooccurrence_m[i,])
}

term_cooccurrence_m=term_cooccurrence_m[!duplicated(term_cooccurrence_m),]
names(term_cooccurrence_m)=c('weight','item1','item2')
term_cooccurrence_m=term_cooccurrence_m %>%  select(item1,item2,weight)
term_cooccurrence_m$weight=as.numeric(term_cooccurrence_m$weight)

畫出Co-occurrence網路圖

g=term_cooccurrence_m %>% graph_from_data_frame(directed = F) 
 # set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
node_name$V2=NA
node_name$V2[1:7]='#00DD00'
node_name$V2[8:15]='#FFAA33'
node_name$V2[16:23]='#EEEE00'
node_name$V2[24:30]='#ff00dd'

V(g)$color=sapply(names(V(g)), function(v){
  node_name$V2[node_name$V1==v]
})
set.seed(0525)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1, pt.cex=1, cex=.8)

對評價滿分的飯店計算評語之間的Co-occurrence

###計算評語之間的Co-occurrence
term_cooccurrence_m=tidybookbht %>%
  filter(word  %in% node_name$V1) %>%   
  pairwise_count(word, Id, sort = TRUE,diag=F)

term_cooccurrence_m=as.data.frame(term_cooccurrence_m)

###移除重複的pairwise
for (i in 1:nrow(term_cooccurrence_m)){
    term_cooccurrence_m[i, ] = sort(term_cooccurrence_m[i,])
}

term_cooccurrence_m=term_cooccurrence_m[!duplicated(term_cooccurrence_m),]
names(term_cooccurrence_m)=c('weight','item1','item2')
term_cooccurrence_m=term_cooccurrence_m %>%  select(item1,item2,weight)
term_cooccurrence_m$weight=as.numeric(term_cooccurrence_m$weight)

###畫Co-occurrence網路圖
g=term_cooccurrence_m %>% graph_from_data_frame(directed = F) 
 # set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)


node_name$V2=NA
node_name$V2[1:7]='#00DD00'
node_name$V2[8:15]='#FFAA33'
node_name$V2[16:23]='#EEEE00'
node_name$V2[24:30]='#ff00dd'



V(g)$color=sapply(names(V(g)), function(v){
  node_name$V2[node_name$V1==v]
})


set.seed(0525)
layout11 <- layout.fruchterman.reingold(g)
plot(g, layout=layout11, pt.cex=1, cex=.8)

畫出Co-occurrence網路圖

g=term_cooccurrence_m %>% graph_from_data_frame(directed = F) 
 # set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
node_name$V2=NA
node_name$V2[1:7]='#00DD00'
node_name$V2[8:15]='#FFAA33'
node_name$V2[16:23]='#EEEE00'
node_name$V2[24:30]='#ff00dd'

V(g)$color=sapply(names(V(g)), function(v){
  node_name$V2[node_name$V1==v]
})
set.seed(0525)
layout11 <- layout.fruchterman.reingold(g)
plot(g, layout=layout11, pt.cex=1, cex=.8)

以Degree作為頂點大小

degree(g, mode="all")
##   住宿   入住   服務   傑出   值得   房間   很棒   下次   老闆   推薦 
##     26     27     26     27     26     25     26     26     24     26 
##   方便   民宿   乾淨   親切   舒適   澎湖   行程   地點   早餐   舒服 
##     22     24     25     26     26     25     23     17     22     22 
##   感覺 老闆娘   不錯   貼心   熱心   令人 好極了   環境 
##     24     18     15     22     23     21      3     25
deg <- degree(g, mode="all")

plot(g, vertex.size=deg*1.2)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

以Closeness作為頂點大小

closeness(g, mode="all", weights=NA, normalized=T)
##      住宿      入住      服務      傑出      值得      房間      很棒 
## 0.9642857 1.0000000 0.9642857 1.0000000 0.9642857 0.9310345 0.9642857 
##      下次      老闆      推薦      方便      民宿      乾淨      親切 
## 0.9642857 0.9000000 0.9642857 0.8437500 0.9000000 0.9310345 0.9642857 
##      舒適      澎湖      行程      地點      早餐      舒服      感覺 
## 0.9642857 0.9310345 0.8709677 0.7297297 0.8437500 0.8437500 0.9000000 
##    老闆娘      不錯      貼心      熱心      令人    好極了      環境 
## 0.7500000 0.6923077 0.8437500 0.8709677 0.8181818 0.5294118 0.9310345
deg <- closeness(g, mode="all" , weights=NA, normalized=T)

plot(g, vertex.size=deg*20)

legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

以betweenness作為頂點大小

betweenness(g, directed=F, weights=NA, normalized = T)
##         住宿         入住         服務         傑出         值得 
## 0.0061411132 0.0332066402 0.0061411132 0.0332066402 0.0061411132 
##         房間         很棒         下次         老闆         推薦 
## 0.0044387578 0.0061411132 0.0061411132 0.0020703976 0.0061411132 
##         方便         民宿         乾淨         親切         舒適 
## 0.0024352191 0.0020703976 0.0044387578 0.0061411132 0.0061411132 
##         澎湖         行程         地點         早餐         舒服 
## 0.0038090198 0.0008644593 0.0003798670 0.0013770180 0.0004212811 
##         感覺       老闆娘         不錯         貼心         熱心 
## 0.0020703976 0.0143806810 0.0000000000 0.0005788450 0.0014372858 
##         令人       好極了         環境 
## 0.0008379420 0.0000000000 0.0052406499
deg <- betweenness(g, directed=F, weights=NA, normalized = T)

plot(g, vertex.size=deg*1000)

legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

一. 確立動機和分析目的 延續期中報告,以booking.com的住客評論來進行文字資料LDA模型分析和社會網絡 路徑分析. 二. 尋找適當的資料集和資料集的描述. 以網路爬蟲抓下 Booking約320家澎湖旅館和民宿業者的住客評論, 有HotelName,Country,Rate,Review,ReviewDate,ReviewNeg,ReviewPos七欄位,共 12255筆資料錄. 三. 資料的分析過程 如程式 四. 視覺化的分析結果與解釋

1.建立LDA模型,分成兩類,依主題代表字來看,第一類為人員的軟服務,例如老闆,       老闆娘,親切,服務,熱情,貼心,推薦等等,第二類為硬體設施,如房間,早餐,乾淨       ,舒適,地點,隔音等等.
2.利用社群網路圖,大約可看出有幾個議題面向,和這些面向消費者關注那些事項,       例如房間:乾淨,不錯
      民宿:舒適,很棒
      老闆/老闆娘:親切,熱心
      澎湖:行程,地點
      早餐:服務,乾淨,環境
      入住:值得,環境
      住宿:方便,民宿,老闆,舒適
      推薦:住宿,服務,環境,早餐

五.結論
民宿與飯店,在評語分析得到結果,消費者關心的事項不全然相同。