require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(tidyr)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(topicmodels)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(widyr)
library(ggraph)
setwd("C:/learning/mid")
booking <- fread("booking_reviews.csv")
# Split the properties by name into B&B-style lodgings (bhs) and hotels (bht),
# keeping only the reviews whose Rate contains 10
bhs <- booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking$HotelName), ]
bhs <- bhs[grepl("10", bhs$Rate), ]
bht <- booking[grepl("店|館|中心|文旅", booking$HotelName), ]
bht <- bht[grepl("10", bht$Rate), ]
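# Quick sanity check (sketch, not part of the original output): the number of
# reviews kept in each group
nrow(bhs); nrow(bht)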
# jieba tokenizer with custom stop-word and user dictionaries;
# single-character tokens are dropped
jieba_tokenizer <- worker(stop_word = "stop_words.txt", user = "user_words.txt")
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens) > 1]
    return(tokens)
  })
}
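# Quick check of the tokenizer on a single made-up sentence (a sketch; the
# sentence is only an illustration and the exact segmentation depends on the
# stop-word and user dictionaries)
book_tokenizer("老闆娘很親切,房間乾淨又舒適")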
bhs$Review <- as.character(bhs$Review)
tidybookbhs <- bhs %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)
bht$Review <- as.character(bht$Review)
tidybookbht <- bht %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)
tidybook <- booking %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  count(HotelName, word) %>%
  filter(!word %in% c('好極了', '傑出', '很棒')) %>%  # drop generic praise labels
  rename(count = n) %>%
  mutate(Id = group_indices(., HotelName))
tidybook_dtm <- tidybook %>% cast_dtm(Id, word, count)
tidybook_dtm
## <<DocumentTermMatrix (documents: 227, terms: 3461)>>
## Non-/sparse entries: 11894/773753
## Sparsity : 98%
## Maximal term length: 16
## Weighting : term frequency (tf)
book_lda <- LDA(tidybook_dtm, k = 2, control = list(seed = 1234))
book_topics <- tidy(book_lda, matrix = "beta")
# Note: in the tidy() function, use matrix = "beta" to extract the Phi (term-topic) matrix.
book_topics
## # A tibble: 6,922 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 80 0.000257
## 2 2 80 0.000117
## 3 1 nice 0.00483
## 4 2 nice 0.00288
## 5 1 小孩 0.000206
## 6 2 小孩 0.000751
## 7 1 不太 0.000257
## 8 2 不太 0.000117
## 9 1 不錯 0.00863
## 10 2 不錯 0.00962
## # ... with 6,912 more rows
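# The document-topic (Theta) matrix can be extracted the same way; a sketch,
# not run above:
book_documents <- tidy(book_lda, matrix = "gamma")
book_documents %>% arrange(document, desc(gamma)) %>% head()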
book_top_terms <- book_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
book_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
book_top_terms
## # A tibble: 20 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 令人 0.0257
## 2 1 民宿 0.0234
## 3 1 愉悅 0.0205
## 4 1 老闆 0.0189
## 5 1 舒適 0.0182
## 6 1 親切 0.0162
## 7 1 服務 0.0151
## 8 1 房間 0.0147
## 9 1 住宿 0.0138
## 10 1 乾淨 0.0134
## 11 2 令人 0.0257
## 12 2 民宿 0.0224
## 13 2 愉悅 0.0211
## 14 2 老闆 0.0166
## 15 2 房間 0.0160
## 16 2 尚可 0.0146
## 17 2 舒適 0.0134
## 18 2 乾淨 0.0129
## 19 2 服務 0.0126
## 20 2 住宿 0.0112
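# A sketch (not run above) to contrast the two topics more directly: spread the
# beta values and compute a log2 ratio of topic 2 over topic 1
beta_spread <- book_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread %>% arrange(desc(abs(log_ratio))) %>% head(10)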
### Read the node word lists (UTF-8) and compute term co-occurrence for the B&B group (bhs)
node_name <- fread(file = "c:/learning/mid/word.txt", encoding = 'UTF-8', header = F)
nnode_name <- fread(file = "c:/learning/mid/word1.txt", encoding = 'UTF-8', header = F)
term_cooccurrence_m <- tidybookbhs %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = F)
term_cooccurrence_m <- as.data.frame(term_cooccurrence_m)
### Remove duplicated pairwise entries (the same pair appears as both (a, b) and (b, a))
for (i in 1:nrow(term_cooccurrence_m)) {
  term_cooccurrence_m[i, ] <- sort(term_cooccurrence_m[i, ])
}
term_cooccurrence_m <- term_cooccurrence_m[!duplicated(term_cooccurrence_m), ]
names(term_cooccurrence_m) <- c('weight', 'item1', 'item2')
term_cooccurrence_m <- term_cooccurrence_m %>% select(item1, item2, weight)
term_cooccurrence_m$weight <- as.numeric(term_cooccurrence_m$weight)
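# Alternative dedup (sketch, kept separate so the original result is untouched):
# pairwise_count() returns both (a, b) and (b, a), so keeping only pairs with
# item1 < item2 removes the mirrored duplicates in one step
term_cooccurrence_alt <- tidybookbhs %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = F) %>%
  filter(item1 < item2) %>%
  rename(weight = n)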
### Draw the co-occurrence network graph for the B&B group
g <- term_cooccurrence_m %>% graph_from_data_frame(directed = F)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# colour the nodes by the four word groups defined by row ranges of word.txt
node_name$V2 <- NA
node_name$V2[1:7] <- '#00DD00'
node_name$V2[8:15] <- '#FFAA33'
node_name$V2[16:23] <- '#EEEE00'
node_name$V2[24:30] <- '#ff00dd'
V(g)$color <- sapply(names(V(g)), function(v) {
  node_name$V2[node_name$V1 == v]
})
set.seed(0525)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout = layout1, pt.cex = 1, cex = .8)
### Compute the co-occurrence between review terms for the hotel group (bht)
term_cooccurrence_m <- tidybookbht %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = F)
term_cooccurrence_m <- as.data.frame(term_cooccurrence_m)
### Remove duplicated pairwise entries
for (i in 1:nrow(term_cooccurrence_m)) {
  term_cooccurrence_m[i, ] <- sort(term_cooccurrence_m[i, ])
}
term_cooccurrence_m <- term_cooccurrence_m[!duplicated(term_cooccurrence_m), ]
names(term_cooccurrence_m) <- c('weight', 'item1', 'item2')
term_cooccurrence_m <- term_cooccurrence_m %>% select(item1, item2, weight)
term_cooccurrence_m$weight <- as.numeric(term_cooccurrence_m$weight)
### Draw the co-occurrence network graph for the hotel group
g <- term_cooccurrence_m %>% graph_from_data_frame(directed = F)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
node_name$V2 <- NA
node_name$V2[1:7] <- '#00DD00'
node_name$V2[8:15] <- '#FFAA33'
node_name$V2[16:23] <- '#EEEE00'
node_name$V2[24:30] <- '#ff00dd'
V(g)$color <- sapply(names(V(g)), function(v) {
  node_name$V2[node_name$V1 == v]
})
set.seed(0525)
layout11 <- layout.fruchterman.reingold(g)
plot(g, layout = layout11, pt.cex = 1, cex = .8)
degree(g, mode="all")
## 住宿 入住 服務 傑出 值得 房間 很棒 下次 老闆 推薦
## 26 27 26 27 26 25 26 26 24 26
## 方便 民宿 乾淨 親切 舒適 澎湖 行程 地點 早餐 舒服
## 22 24 25 26 26 25 23 17 22 22
## 感覺 老闆娘 不錯 貼心 熱心 令人 好極了 環境
## 24 18 15 22 23 21 3 25
deg <- degree(g, mode="all")
plot(g, vertex.size=deg*1.2)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)
closeness(g, mode="all", weights=NA, normalized=T)
## 住宿 入住 服務 傑出 值得 房間 很棒
## 0.9642857 1.0000000 0.9642857 1.0000000 0.9642857 0.9310345 0.9642857
## 下次 老闆 推薦 方便 民宿 乾淨 親切
## 0.9642857 0.9000000 0.9642857 0.8437500 0.9000000 0.9310345 0.9642857
## 舒適 澎湖 行程 地點 早餐 舒服 感覺
## 0.9642857 0.9310345 0.8709677 0.7297297 0.8437500 0.8437500 0.9000000
## 老闆娘 不錯 貼心 熱心 令人 好極了 環境
## 0.7500000 0.6923077 0.8437500 0.8709677 0.8181818 0.5294118 0.9310345
deg <- closeness(g, mode="all" , weights=NA, normalized=T)
plot(g, vertex.size=deg*20)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)
betweenness(g, directed=F, weights=NA, normalized = T)
## 住宿 入住 服務 傑出 值得
## 0.0061411132 0.0332066402 0.0061411132 0.0332066402 0.0061411132
## 房間 很棒 下次 老闆 推薦
## 0.0044387578 0.0061411132 0.0061411132 0.0020703976 0.0061411132
## 方便 民宿 乾淨 親切 舒適
## 0.0024352191 0.0020703976 0.0044387578 0.0061411132 0.0061411132
## 澎湖 行程 地點 早餐 舒服
## 0.0038090198 0.0008644593 0.0003798670 0.0013770180 0.0004212811
## 感覺 老闆娘 不錯 貼心 熱心
## 0.0020703976 0.0143806810 0.0000000000 0.0005788450 0.0014372858
## 令人 好極了 環境
## 0.0008379420 0.0000000000 0.0052406499
deg <- betweenness(g, directed=F, weights=NA, normalized = T)
plot(g, vertex.size=deg*1000)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)
I. Motivation and analysis objective
Continuing the midterm report, we analyse guest reviews from booking.com with an LDA topic model and social-network path analysis.

II. Dataset and description
Guest reviews of about 320 Penghu hotels and B&B operators were collected from Booking with a web crawler. The data have seven columns (HotelName, Country, Rate, Review, ReviewDate, ReviewNeg, ReviewPos) and 12,255 records in total.

III. Analysis process
See the code above.

IV. Visualised analysis results and interpretation
1. The LDA model was built with two topics. Judging from the representative terms, topic 1 concerns the soft, people side of the service, e.g. 老闆 (owner), 老闆娘 (owner's wife), 親切 (friendly), 服務 (service), 熱情 (warm), 貼心 (thoughtful) and 推薦 (recommend); topic 2 concerns the hardware and facilities, e.g. 房間 (room), 早餐 (breakfast), 乾淨 (clean), 舒適 (comfortable), 地點 (location) and 隔音 (soundproofing).
2. The co-occurrence network roughly shows several facets and what consumers pay attention to within each, for example:
房間 (room): 乾淨 (clean), 不錯 (good)
民宿 (B&B): 舒適 (comfortable), 很棒 (great)
老闆/老闆娘 (owner / owner's wife): 親切 (friendly), 熱心 (helpful)
澎湖 (Penghu): 行程 (itinerary), 地點 (location)
早餐 (breakfast): 服務 (service), 乾淨 (clean), 環境 (environment)
入住 (check-in): 值得 (worth it), 環境 (environment)
住宿 (stay): 方便 (convenient), 民宿 (B&B), 老闆 (owner), 舒適 (comfortable)
推薦 (recommend): 住宿 (stay), 服務 (service), 環境 (environment), 早餐 (breakfast)
V. Conclusion
The review analysis shows that what consumers care about is not entirely the same for B&Bs as for hotels.