library(data.table)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(tidyr)
library(tidytext)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(tm)
## Loading required package: NLP
library(stringr)
library(widyr)
library(ggraph)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Work from the project directory so the relative CSV and dictionary paths
# used below resolve.
setwd("C:/learning/mid")

# Hotel table and review table exported from Booking.
bh <- fread("booking_hotels.csv")
booking <- fread("booking_reviews.csv")

# Homestay-like properties: HotelName contains one of the homestay keywords.
bhs <- booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking[["HotelName"]]), ]

# Hotel-like properties: HotelName contains one of the hotel keywords.
bht <- booking[grepl("店|館|中心|文旅", booking[["HotelName"]]), ]

自訂user word及停用字並用結巴斷詞

# jieba segmenter configured with the project's stop-word list and user
# dictionary (both read from the working directory).
jieba_tokenizer <- worker(user = "user_words.txt", stop_word = "stop_words.txt")

# Tokenizer passed to unnest_tokens().
#   t: a vector or list of texts.
# Returns a list with one element per text: the jieba tokens of that text
# that are longer than one character.
book_tokenizer <- function(t) {
  keep_long_tokens <- function(x) {
    words <- segment(x, jieba_tokenizer)
    words[nchar(words) > 1]
  }
  lapply(t, keep_long_tokens)
}

對booking做民宿分類,並斷詞

# Tokenise the homestay reviews: one row per (hotel, word). Id is the group
# index of HotelName, i.e. one integer per hotel.
bhs$Review <- as.character(bhs$Review)
tidybookbhs <- unnest_tokens(bhs, word, Review, token = book_tokenizer)
tidybookbhs <- mutate(tidybookbhs, Id = group_indices(tidybookbhs, HotelName))
tidybookbhs <- select(tidybookbhs, HotelName, word, Id)

# Inspect the structure and the first rows of the token table.
str(tidybookbhs)
## 'data.frame':    12582 obs. of  3 variables:
##  $ HotelName: chr  "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
##  $ word     : chr  "hen" "棒棒" "乾淨" "新穎" ...
##  $ Id       : int  18 18 18 18 18 18 18 18 18 18 ...
head(tidybookbhs)
##      HotelName     word Id
## 1 天空格子商旅      hen 18
## 2 天空格子商旅     棒棒 18
## 3 天空格子商旅     乾淨 18
## 4 天空格子商旅     新穎 18
## 5 天空格子商旅 服務態度 18
## 6 天空格子商旅     很棒 18

計算民宿評語之間的Co-occurrence:

### Use the 30 most frequent homestay review words as graph nodes
### (read from word.txt, one word per row in column V1)
node_name <- fread(file = "c:/learning/mid/word.txt", encoding = "UTF-8", header = FALSE)

# Keep only the node words, then count how many Ids (hotels) each pair of
# words shares; pairs are sorted by count and self-pairs are excluded.
term_cooccurrence_m <- filter(tidybookbhs, word %in% node_name$V1)
term_cooccurrence_m <- pairwise_count(term_cooccurrence_m, word, Id, sort = TRUE, diag = FALSE)

# Plain data.frame so the base-R row handling that follows applies.
term_cooccurrence_m <- as.data.frame(term_cooccurrence_m)

移除重複的pairwise:

# Remove mirrored duplicates: the pair table lists every pair twice, as
# (a, b) and (b, a). Put each pair into a canonical order (item1 <= item2)
# with vectorised pmin()/pmax(), then drop repeated rows. Only the two word
# columns are compared, so the count can never be confused with a word, and
# an empty table is handled without special-casing.
term_cooccurrence_m <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_m$item1),
               as.character(term_cooccurrence_m$item2)),
  item2 = pmax(as.character(term_cooccurrence_m$item1),
               as.character(term_cooccurrence_m$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_m$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_m <- term_cooccurrence_m[!duplicated(term_cooccurrence_m), ]
head(term_cooccurrence_m)
##     item1  item2 weight
## 1  好極了   傑出    125
## 3    很棒   傑出    115
## 5  好極了   很棒     95
## 7    民宿   傑出     93
## 9    民宿 好極了     83
## 11   傑出   舒適     81

畫出Co-occurrence網路圖

# Undirected co-occurrence graph built from the (item1, item2, weight) table.
g <- graph_from_data_frame(term_cooccurrence_m, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
node_name$V2 <- NA
node_name$V2[seq(1, 8)] <- "#00DD00"
node_name$V2[seq(9, 15)] <- "#FFAA33"
node_name$V2[seq(16, 21)] <- "#EEEE00"

# Look up each vertex's colour by its word.
V(g)$color <- sapply(names(V(g)), function(vertex_word) {
  node_name$V2[node_name$V1 == vertex_word]
})

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout = layout1, pt.cex = 1, cex = 0.8)

算4分位數,對民宿進行分群

# Quartiles of the homestay ratings; the tier cut-offs below are taken from these.
quantile(bhs[["Rate"]])
##   0%  25%  50%  75% 100% 
##  2.5  8.8  9.6 10.0 10.0

將民宿分三群

# Split the homestay reviews into three rating tiers:
#   bhs1: rated 10
#   bhs2: rated above 8.8 and below 10
#   bhs3: rated 8.8 or lower
# Rate is compared numerically; pattern-matching its text form would also
# select any rating whose printed digits merely contain "10".
bhs1 <- bhs[!is.na(bhs$Rate) & bhs$Rate >= 10, ]
bhs2 <- filter(bhs, Rate > 8.8 & Rate < 10)
bhs3 <- filter(bhs, Rate <= 8.8)

評價10的民宿

# Tokenise the reviews of homestays rated 10: one row per (hotel, word).
# Id is the group index of HotelName, i.e. one integer per hotel.
bhs1$Review <- as.character(bhs1$Review)
tidybookbhs1 <- unnest_tokens(bhs1, word, Review, token = book_tokenizer)
tidybookbhs1 <- mutate(tidybookbhs1, Id = group_indices(tidybookbhs1, HotelName))
tidybookbhs1 <- select(tidybookbhs1, HotelName, word, Id)

# Inspect the structure and the first rows of the token table.
str(tidybookbhs1)
## 'data.frame':    6478 obs. of  3 variables:
##  $ HotelName: chr  "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
##  $ word     : chr  "hen" "棒棒" "傑出" "傑出" ...
##  $ Id       : int  17 17 17 17 17 17 17 17 17 17 ...
head(tidybookbhs1)
##      HotelName word Id
## 1 天空格子商旅  hen 17
## 2 天空格子商旅 棒棒 17
## 3 天空格子商旅 傑出 17
## 4 天空格子商旅 傑出 17
## 5 天空格子商旅 住宿 17
## 6 天空格子商旅 地點 17

計算評語之間的Co-occurrence:

# Keep only the node words, then count how many Ids (hotels) each pair of
# words shares; pairs are sorted by count and self-pairs are excluded.
term_cooccurrence_m1 <- filter(tidybookbhs1, word %in% node_name$V1)
term_cooccurrence_m1 <- pairwise_count(term_cooccurrence_m1, word, Id, sort = TRUE, diag = FALSE)

# Plain data.frame so the base-R row handling that follows applies.
term_cooccurrence_m1 <- as.data.frame(term_cooccurrence_m1)

移除重複的pairwise

# Remove mirrored duplicates: the pair table lists every pair twice, as
# (a, b) and (b, a). Put each pair into a canonical order (item1 <= item2)
# with vectorised pmin()/pmax(), then drop repeated rows. Only the two word
# columns are compared, so the count can never be confused with a word, and
# an empty table is handled without special-casing.
term_cooccurrence_m1 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_m1$item1),
               as.character(term_cooccurrence_m1$item2)),
  item2 = pmax(as.character(term_cooccurrence_m1$item1),
               as.character(term_cooccurrence_m1$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_m1$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_m1 <- term_cooccurrence_m1[!duplicated(term_cooccurrence_m1), ]

畫出Co-occurrence網路圖

# Undirected co-occurrence graph built from the (item1, item2, weight) table.
g <- graph_from_data_frame(term_cooccurrence_m1, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
node_name$V2 <- NA
node_name$V2[seq(1, 8)] <- "#00DD00"
node_name$V2[seq(9, 15)] <- "#FFAA33"
node_name$V2[seq(16, 21)] <- "#EEEE00"

# Look up each vertex's colour by its word.
V(g)$color <- sapply(names(V(g)), function(vertex_word) {
  node_name$V2[node_name$V1 == vertex_word]
})

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout11 <- layout.fruchterman.reingold(g)
plot(g, layout = layout11, pt.cex = 1, cex = 0.8)

評價大於8.8小於10的民宿

### Tokenise the reviews of homestays rated above 8.8 and below 10
### (one row per hotel/word; Id is the group index of HotelName)
tidybookbhs2 <- bhs2 %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)

### Count co-occurrences between node words: for each pair, the number of
### Ids (hotels) under which both words appear
term_cooccurrence_m2 <- tidybookbhs2 %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_m2 <- as.data.frame(term_cooccurrence_m2)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_m2 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_m2$item1),
               as.character(term_cooccurrence_m2$item2)),
  item2 = pmax(as.character(term_cooccurrence_m2$item1),
               as.character(term_cooccurrence_m2$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_m2$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_m2 <- term_cooccurrence_m2[!duplicated(term_cooccurrence_m2), ]

### Draw the co-occurrence network for this rating tier
g <- graph_from_data_frame(term_cooccurrence_m2, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
node_name$V2 <- NA
node_name$V2[1:8] <- "#00DD00"
node_name$V2[9:15] <- "#FFAA33"
node_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- node_name$V2[match(V(g)$name, node_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout12 <- layout.fruchterman.reingold(g)
plot(g, layout = layout12, pt.cex = 1, cex = .8)

評價小於等於8.8的民宿

### Tokenise the reviews of homestays rated 8.8 or lower
### (one row per hotel/word; Id is the group index of HotelName)
tidybookbhs3 <- bhs3 %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)

### Count co-occurrences between node words: for each pair, the number of
### Ids (hotels) under which both words appear
term_cooccurrence_m3 <- tidybookbhs3 %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_m3 <- as.data.frame(term_cooccurrence_m3)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_m3 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_m3$item1),
               as.character(term_cooccurrence_m3$item2)),
  item2 = pmax(as.character(term_cooccurrence_m3$item1),
               as.character(term_cooccurrence_m3$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_m3$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_m3 <- term_cooccurrence_m3[!duplicated(term_cooccurrence_m3), ]

### Draw the co-occurrence network for this rating tier
g <- graph_from_data_frame(term_cooccurrence_m3, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
node_name$V2 <- NA
node_name$V2[1:8] <- "#00DD00"
node_name$V2[9:15] <- "#FFAA33"
node_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- node_name$V2[match(V(g)$name, node_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout13 <- layout.fruchterman.reingold(g)
plot(g, layout = layout13, pt.cex = 1, cex = .8)

對booking做民宿分類,並對負評斷詞

# Tokenise the negative-review text (ReviewNeg) of homestays: one row per
# (hotel, word). Id is the group index of HotelName, i.e. one integer per hotel.
tidybookbhsn <- unnest_tokens(bhs, wordn, ReviewNeg, token = book_tokenizer)
tidybookbhsn <- mutate(tidybookbhsn, Id = group_indices(tidybookbhsn, HotelName))
tidybookbhsn <- select(tidybookbhsn, HotelName, wordn, Id)

# Inspect the structure and the first rows of the token table.
str(tidybookbhsn)
## 'data.frame':    10834 obs. of  3 variables:
##  $ HotelName: chr  "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
##  $ wordn    : chr  "枕頭" "支撐" "不足" "洗手台" ...
##  $ Id       : int  16 16 16 16 16 16 16 16 16 16 ...
head(tidybookbhsn)
##      HotelName  wordn Id
## 1 天空格子商旅   枕頭 16
## 2 天空格子商旅   支撐 16
## 3 天空格子商旅   不足 16
## 4 天空格子商旅 洗手台 16
## 5 天空格子商旅   浴室 16
## 6 天空格子商旅   澗水 16

計算評語之間的Co-occurrence

### Use the 30 most frequent homestay negative-review words as graph nodes
### (read from wordn.txt, one word per row in column V1)
nnode_name <- fread(file = "c:/learning/mid/wordn.txt", encoding = "UTF-8", header = FALSE)

# Count co-occurrences between node words: for each pair, the number of
# Ids (hotels) under which both words appear.
term_cooccurrence_mn <- tidybookbhsn %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mn <- as.data.frame(term_cooccurrence_mn)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mn <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mn$item1),
               as.character(term_cooccurrence_mn$item2)),
  item2 = pmax(as.character(term_cooccurrence_mn$item1),
               as.character(term_cooccurrence_mn$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mn$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mn <- term_cooccurrence_mn[!duplicated(term_cooccurrence_mn), ]

畫出Co-occurrence網路圖

# Undirected co-occurrence graph built from the (item1, item2, weight) table.
g <- graph_from_data_frame(term_cooccurrence_mn, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout2 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for this graph, not one from another graph.
plot(g, layout = layout2, pt.cex = 1, cex = .8)

評價10的民宿

### Tokenise the negative-review text of homestays rated 10
### (one row per hotel/word; Id is the group index of HotelName)
tidybookbhsn1 <- bhs1 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)

### Count co-occurrences between node words: for each pair, the number of
### Ids (hotels) under which both words appear
term_cooccurrence_mn1 <- tidybookbhsn1 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mn1 <- as.data.frame(term_cooccurrence_mn1)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mn1 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mn1$item1),
               as.character(term_cooccurrence_mn1$item2)),
  item2 = pmax(as.character(term_cooccurrence_mn1$item1),
               as.character(term_cooccurrence_mn1$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mn1$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mn1 <- term_cooccurrence_mn1[!duplicated(term_cooccurrence_mn1), ]

### Draw the negative-review co-occurrence network
g <- graph_from_data_frame(term_cooccurrence_mn1, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout21 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for this graph, not one from another graph.
plot(g, layout = layout21, pt.cex = 1, cex = .8)

評價大於8.8小於10的民宿

### Tokenise the negative-review text of homestays rated above 8.8 and below 10
### (one row per hotel/word; Id is the group index of HotelName)
tidybookbhsn2 <- bhs2 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)

### Count co-occurrences between node words: for each pair, the number of
### Ids (hotels) under which both words appear
term_cooccurrence_mn2 <- tidybookbhsn2 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mn2 <- as.data.frame(term_cooccurrence_mn2)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mn2 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mn2$item1),
               as.character(term_cooccurrence_mn2$item2)),
  item2 = pmax(as.character(term_cooccurrence_mn2$item1),
               as.character(term_cooccurrence_mn2$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mn2$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mn2 <- term_cooccurrence_mn2[!duplicated(term_cooccurrence_mn2), ]

### Draw the negative-review co-occurrence network
g <- graph_from_data_frame(term_cooccurrence_mn2, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout22 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for this graph, not one from another graph.
plot(g, layout = layout22, pt.cex = 1, cex = .8)

評價小於等於8.8的民宿

### Tokenise the negative-review text of homestays rated 8.8 or lower.
### The source table is bhs3 (the <= 8.8 tier), matching this section.
### (one row per hotel/word; Id is the group index of HotelName)
tidybookbhsn3 <- bhs3 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)

### Count co-occurrences between node words: for each pair, the number of
### Ids (hotels) under which both words appear
term_cooccurrence_mn3 <- tidybookbhsn3 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mn3 <- as.data.frame(term_cooccurrence_mn3)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mn3 <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mn3$item1),
               as.character(term_cooccurrence_mn3$item2)),
  item2 = pmax(as.character(term_cooccurrence_mn3$item1),
               as.character(term_cooccurrence_mn3$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mn3$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mn3 <- term_cooccurrence_mn3[!duplicated(term_cooccurrence_mn3), ]

### Draw the negative-review co-occurrence network
g <- graph_from_data_frame(term_cooccurrence_mn3, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout23 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for this graph, not one from another graph.
plot(g, layout = layout23, pt.cex = 1, cex = .8)

對booking做飯店分類,並斷詞

# Tokenise the hotel reviews: one row per (hotel, word). Id is the group
# index of HotelName, i.e. one integer per hotel.
bht$Review <- as.character(bht$Review)
tidybookbht <- unnest_tokens(bht, word, Review, token = book_tokenizer)
tidybookbht <- mutate(tidybookbht, Id = group_indices(tidybookbht, HotelName))
tidybookbht <- select(tidybookbht, HotelName, word, Id)

# Inspect the structure and the first rows of the token table.
str(tidybookbht)
## 'data.frame':    8293 obs. of  3 variables:
##  $ HotelName: chr  "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
##  $ word     : chr  "服務" "人員" "態度" "傑出" ...
##  $ Id       : int  28 28 28 28 28 28 28 28 28 28 ...
head(tidybookbht)
##    HotelName   word Id
## 1 雅霖大飯店   服務 28
## 2 雅霖大飯店   人員 28
## 3 雅霖大飯店   態度 28
## 4 雅霖大飯店   傑出 28
## 5 雅霖大飯店   舒服 28
## 6 雅霖大飯店 好極了 28

計算飯店評語之間的Co-occurrence:

### Use the 30 most frequent hotel review words as graph nodes
### (read from word1.txt, one word per row in column V1)
tnode_name <- fread(file = "c:/learning/mid/word1.txt", encoding = "UTF-8", header = FALSE)

### Count co-occurrences between node words in hotel reviews: for each pair,
### the number of Ids (hotels) under which both words appear
term_cooccurrence_mht <- tidybookbht %>%
  filter(word %in% tnode_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mht <- as.data.frame(term_cooccurrence_mht)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mht <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mht$item1),
               as.character(term_cooccurrence_mht$item2)),
  item2 = pmax(as.character(term_cooccurrence_mht$item1),
               as.character(term_cooccurrence_mht$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mht$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mht <- term_cooccurrence_mht[!duplicated(term_cooccurrence_mht), ]

### Draw the positive-review co-occurrence network
g <- graph_from_data_frame(term_cooccurrence_mht, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
tnode_name$V2 <- NA
tnode_name$V2[1:8] <- "#00DD00"
tnode_name$V2[9:15] <- "#FFAA33"
tnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- tnode_name$V2[match(V(g)$name, tnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout3 <- layout.fruchterman.reingold(g)
plot(g, layout = layout3, pt.cex = 1, cex = .8)

對booking做飯店分類,並對負評斷詞

# Tokenise the negative-review text (ReviewNeg) of hotels: one row per
# (hotel, word). Id is the group index of HotelName, i.e. one integer per hotel.
tidybookbhtn <- unnest_tokens(bht, wordn, ReviewNeg, token = book_tokenizer)
tidybookbhtn <- mutate(tidybookbhtn, Id = group_indices(tidybookbhtn, HotelName))
tidybookbhtn <- select(tidybookbhtn, HotelName, wordn, Id)

# Inspect the structure and the first rows of the token table.
str(tidybookbhtn)
## 'data.frame':    13111 obs. of  3 variables:
##  $ HotelName: chr  "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
##  $ wordn    : chr  "這次" "住宿" "陽台" "每天" ...
##  $ Id       : int  24 24 24 24 24 24 24 24 24 24 ...
head(tidybookbhtn)
##    HotelName wordn Id
## 1 雅霖大飯店  這次 24
## 2 雅霖大飯店  住宿 24
## 3 雅霖大飯店  陽台 24
## 4 雅霖大飯店  每天 24
## 5 雅霖大飯店  早餐 24
## 6 雅霖大飯店  套餐 24

計算評語之間的Co-occurrence

### Use the 30 most frequent hotel negative-review words as graph nodes
### (read from wordn1.txt, one word per row in column V1)
tnnode_name <- fread(file = "c:/learning/mid/wordn1.txt", encoding = "UTF-8", header = FALSE)

# Count co-occurrences between node words: for each pair, the number of
# Ids (hotels) under which both words appear.
term_cooccurrence_mhtn <- tidybookbhtn %>%
  filter(wordn %in% tnnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)

term_cooccurrence_mhtn <- as.data.frame(term_cooccurrence_mhtn)

### Remove mirrored pairs: every pair is listed as (a, b) and (b, a).
### Canonicalise each pair (item1 <= item2) with vectorised pmin()/pmax()
### and drop repeated rows; only the word columns are compared, and an
### empty table needs no special handling.
term_cooccurrence_mhtn <- data.frame(
  item1 = pmin(as.character(term_cooccurrence_mhtn$item1),
               as.character(term_cooccurrence_mhtn$item2)),
  item2 = pmax(as.character(term_cooccurrence_mhtn$item1),
               as.character(term_cooccurrence_mhtn$item2)),
  # Co-occurrence count becomes the edge weight.
  weight = as.numeric(term_cooccurrence_mhtn$n),
  stringsAsFactors = FALSE
)
term_cooccurrence_mhtn <- term_cooccurrence_mhtn[!duplicated(term_cooccurrence_mhtn), ]

### Draw the negative-review co-occurrence network
g <- graph_from_data_frame(term_cooccurrence_mhtn, directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Colour nodes by their position in the node list: rows 1-8 green,
# rows 9-15 orange, rows 16-21 yellow; any later rows stay NA.
tnnode_name$V2 <- NA
tnnode_name$V2[1:8] <- "#00DD00"
tnnode_name$V2[9:15] <- "#FFAA33"
tnnode_name$V2[16:21] <- "#EEEE00"

# match() yields exactly one colour per vertex, in vertex order.
V(g)$color <- tnnode_name$V2[match(V(g)$name, tnnode_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout4 <- layout.fruchterman.reingold(g)
plot(g, layout = layout4, pt.cex = 1, cex = .8)