算4分位數,對民宿進行分群
# Quartiles of the B&B ratings; the printed 25% (8.8) and 100% (10) values are
# the cut points used for the three rating groups below.
quantile(bhs$Rate)
## 0% 25% 50% 75% 100%
## 2.5 8.8 9.6 10.0 10.0
將民宿分三群
# Split the B&Bs into three rating groups at the quartile cut points:
#   bhs1: rating of 10, bhs2: above 8.8 and below 10, bhs3: 8.8 or lower.
# The top group is picked by numeric comparison rather than by searching the
# rating for the text "10", so the three groups partition the data by value.
# which() drops rows whose Rate is NA, as the text match did.
bhs1 <- bhs[which(bhs$Rate >= 10), ]
bhs2 <- filter(bhs, Rate > 8.8 & Rate < 10)
bhs3 <- filter(bhs, Rate <= 8.8)
評價10的民宿
# Tokenise the positive reviews of the B&Bs rated 10.
# Review is converted to character before it is handed to the tokenizer.
bhs1$Review <- as.character(bhs1$Review)
tidybookbhs1 <- unnest_tokens(bhs1, word, Review, token = book_tokenizer)
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhs1$Id <- group_indices(tidybookbhs1, HotelName)
tidybookbhs1 <- select(tidybookbhs1, HotelName, word, Id)
str(tidybookbhs1)
## 'data.frame': 6478 obs. of 3 variables:
## $ HotelName: chr "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
## $ word : chr "hen" "棒棒" "傑出" "傑出" ...
## $ Id : int 17 17 17 17 17 17 17 17 17 17 ...
# Preview the first rows of the tokenised table (one row per word).
head(tidybookbhs1)
## HotelName word Id
## 1 天空格子商旅 hen 17
## 2 天空格子商旅 棒棒 17
## 3 天空格子商旅 傑出 17
## 4 天空格子商旅 傑出 17
## 5 天空格子商旅 住宿 17
## 6 天空格子商旅 地點 17
計算評語之間的Co-occurrence:
# Keep only the words listed in node_name$V1, then count how often two of
# them appear under the same Id; the result is stored as a plain data.frame.
term_cooccurrence_m1 <- as.data.frame(
  pairwise_count(
    filter(tidybookbhs1, word %in% node_name$V1),
    word, Id,
    sort = TRUE, diag = FALSE
  )
)
移除重複的pairwise
# Collapse mirrored pairs: the network is undirected, so (a, b) and (b, a)
# are one edge. The two words of each pair are put in a fixed order with
# pmin()/pmax() instead of sorting every mixed-type row in a loop; that
# approach relied on the count collating before both words and misbehaved on
# an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_m1 <- with(term_cooccurrence_m1, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
# Keep the first occurrence of every (item1, item2, weight) row.
term_cooccurrence_m1 <- unique(term_cooccurrence_m1)
畫出Co-occurrence網路圖
# Build the undirected graph from the (item1, item2, weight) edge list.
g <- graph_from_data_frame(term_cooccurrence_m1, directed = FALSE)
# set labels and degrees of vertices
V(g)$degree <- degree(g)
V(g)$label <- V(g)$name
# Colour is assigned by row position in node_name: rows 1-8, 9-15 and 16-21
# each get one colour; any further rows keep NA.
node_name$V2 <- NA
node_name$V2[1:21] <- rep(c('#00DD00', '#FFAA33', '#EEEE00'), times = c(8, 7, 6))
V(g)$color <- sapply(names(V(g)), function(vertex_word) {
  node_name$V2[node_name$V1 == vertex_word]
})
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout11 <- layout.fruchterman.reingold(g)
plot(g, layout = layout11, pt.cex = 1, cex = .8)

評價大於8.8小於10的民宿
### Tokenise the positive reviews of the B&Bs rated above 8.8 and below 10
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhs2 <- bhs2 %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)
### Count co-occurrence of the node words under the same Id
term_cooccurrence_m2 <- tidybookbhs2 %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_m2 <- as.data.frame(term_cooccurrence_m2)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_m2 <- with(term_cooccurrence_m2, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_m2 <- unique(term_cooccurrence_m2)
### Draw the positive-review co-occurrence network
g <- term_cooccurrence_m2 %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in node_name (rows 1-8, 9-15, 16-21); rows beyond 21
# keep NA.
node_name$V2 <- NA
node_name$V2[1:8] <- "#00DD00"
node_name$V2[9:15] <- "#FFAA33"
node_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- node_name$V2[match(V(g)$name, node_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout12 <- layout.fruchterman.reingold(g)
plot(g, layout = layout12, pt.cex = 1, cex = .8)

評價小於等於8.8的民宿
### Tokenise the positive reviews of the B&Bs rated 8.8 or lower
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhs3 <- bhs3 %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)
### Count co-occurrence of the node words under the same Id
term_cooccurrence_m3 <- tidybookbhs3 %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_m3 <- as.data.frame(term_cooccurrence_m3)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_m3 <- with(term_cooccurrence_m3, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_m3 <- unique(term_cooccurrence_m3)
### Draw the positive-review co-occurrence network
g <- term_cooccurrence_m3 %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in node_name (rows 1-8, 9-15, 16-21); rows beyond 21
# keep NA.
node_name$V2 <- NA
node_name$V2[1:8] <- "#00DD00"
node_name$V2[9:15] <- "#FFAA33"
node_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- node_name$V2[match(V(g)$name, node_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout13 <- layout.fruchterman.reingold(g)
plot(g, layout = layout13, pt.cex = 1, cex = .8)

對booking做民宿分類,並對負評斷詞
# Tokenise the negative reviews (ReviewNeg) of all B&Bs.
tidybookbhsn <- unnest_tokens(bhs, wordn, ReviewNeg, token = book_tokenizer)
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhsn$Id <- group_indices(tidybookbhsn, HotelName)
tidybookbhsn <- select(tidybookbhsn, HotelName, wordn, Id)
str(tidybookbhsn)
## 'data.frame': 10834 obs. of 3 variables:
## $ HotelName: chr "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
## $ wordn : chr "枕頭" "支撐" "不足" "洗手台" ...
## $ Id : int 16 16 16 16 16 16 16 16 16 16 ...
# Preview the first rows of the tokenised negative-review table.
head(tidybookbhsn)
## HotelName wordn Id
## 1 天空格子商旅 枕頭 16
## 2 天空格子商旅 支撐 16
## 3 天空格子商旅 不足 16
## 4 天空格子商旅 洗手台 16
## 5 天空格子商旅 浴室 16
## 6 天空格子商旅 澗水 16
計算評語之間的Co-occurrence
### Use the 30 most frequent B&B review words as nodes
# NOTE(review): the word list is read from a hard-coded absolute path.
nnode_name <- fread(file = "c:/learning/mid/wordn.txt", encoding = 'UTF-8', header = FALSE)
# Keep only the node words, then count how often two of them share an Id.
term_cooccurrence_mn <- tidybookbhsn %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mn <- as.data.frame(term_cooccurrence_mn)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mn <- with(term_cooccurrence_mn, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
# Keep the first occurrence of every (item1, item2, weight) row.
term_cooccurrence_mn <- unique(term_cooccurrence_mn)
畫出Co-occurrence網路圖
# Build the undirected negative-review graph for all B&Bs.
g <- term_cooccurrence_mn %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in nnode_name (rows 1-8, 9-15, 16-21).
# NOTE(review): the header above mentions 30 node words, but only 21 rows get
# a colour; the rest keep NA - confirm this is intended.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout2 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for THIS graph; the previous code passed
# `layout1`, a layout belonging to a different graph.
plot(g, layout = layout2, pt.cex = 1, cex = .8)

評價10的民宿
### Tokenise the negative reviews of the B&Bs rated 10
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhsn1 <- bhs1 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)
### Count co-occurrence of the node words under the same Id
term_cooccurrence_mn1 <- tidybookbhsn1 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mn1 <- as.data.frame(term_cooccurrence_mn1)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mn1 <- with(term_cooccurrence_mn1, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_mn1 <- unique(term_cooccurrence_mn1)
### Draw the negative-review co-occurrence network
g <- term_cooccurrence_mn1 %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in nnode_name (rows 1-8, 9-15, 16-21); rows beyond
# 21 keep NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout21 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for THIS graph; the previous code passed
# `layout1`, a layout belonging to a different graph.
plot(g, layout = layout21, pt.cex = 1, cex = .8)

評價大於8.8小於10的民宿
### Tokenise the negative reviews of the B&Bs rated above 8.8 and below 10
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhsn2 <- bhs2 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)
### Count co-occurrence of the node words under the same Id
term_cooccurrence_mn2 <- tidybookbhsn2 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mn2 <- as.data.frame(term_cooccurrence_mn2)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mn2 <- with(term_cooccurrence_mn2, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_mn2 <- unique(term_cooccurrence_mn2)
### Draw the negative-review co-occurrence network
g <- term_cooccurrence_mn2 %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in nnode_name (rows 1-8, 9-15, 16-21); rows beyond
# 21 keep NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout22 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for THIS graph; the previous code passed
# `layout1`, a layout belonging to a different graph.
plot(g, layout = layout22, pt.cex = 1, cex = .8)

評價小於等於8.8的民宿
### Tokenise the negative reviews of the B&Bs rated 8.8 or lower
# This section analyses the lowest rating group, so it reads bhs3; the
# previous code tokenised bhs1 (the rating-10 group) by copy-paste mistake.
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhsn3 <- bhs3 %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)
### Count co-occurrence of the node words under the same Id
term_cooccurrence_mn3 <- tidybookbhsn3 %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mn3 <- as.data.frame(term_cooccurrence_mn3)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mn3 <- with(term_cooccurrence_mn3, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_mn3 <- unique(term_cooccurrence_mn3)
### Draw the negative-review co-occurrence network
g <- term_cooccurrence_mn3 %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in nnode_name (rows 1-8, 9-15, 16-21); rows beyond
# 21 keep NA.
nnode_name$V2 <- NA
nnode_name$V2[1:8] <- "#00DD00"
nnode_name$V2[9:15] <- "#FFAA33"
nnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- nnode_name$V2[match(V(g)$name, nnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout23 <- layout.fruchterman.reingold(g)
# Plot with the layout computed for THIS graph; the previous code passed
# `layout1`, a layout belonging to a different graph.
plot(g, layout = layout23, pt.cex = 1, cex = .8)

對booking做飯店分類,並斷詞
# Tokenise the positive reviews of the hotels.
# Review is converted to character before it is handed to the tokenizer.
bht$Review <- as.character(bht$Review)
tidybookbht <- unnest_tokens(bht, word, Review, token = book_tokenizer)
# Id is the group index of HotelName, i.e. one id per property.
tidybookbht$Id <- group_indices(tidybookbht, HotelName)
tidybookbht <- select(tidybookbht, HotelName, word, Id)
str(tidybookbht)
## 'data.frame': 8293 obs. of 3 variables:
## $ HotelName: chr "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
## $ word : chr "服務" "人員" "態度" "傑出" ...
## $ Id : int 28 28 28 28 28 28 28 28 28 28 ...
# Preview the first rows of the tokenised hotel-review table.
head(tidybookbht)
## HotelName word Id
## 1 雅霖大飯店 服務 28
## 2 雅霖大飯店 人員 28
## 3 雅霖大飯店 態度 28
## 4 雅霖大飯店 傑出 28
## 5 雅霖大飯店 舒服 28
## 6 雅霖大飯店 好極了 28
計算飯店評語之間的Co-occurrence:
### Use the 30 most frequent review words as nodes
# NOTE(review): the word list is read from a hard-coded absolute path.
tnode_name <- fread(file = "c:/learning/mid/word1.txt", encoding = 'UTF-8', header = FALSE)
### Count co-occurrence of the node words in the hotel reviews
term_cooccurrence_mht <- tidybookbht %>%
  filter(word %in% tnode_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mht <- as.data.frame(term_cooccurrence_mht)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mht <- with(term_cooccurrence_mht, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_mht <- unique(term_cooccurrence_mht)
### Draw the positive-review co-occurrence network
g <- term_cooccurrence_mht %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in tnode_name (rows 1-8, 9-15, 16-21).
# NOTE(review): the header mentions 30 node words, but only 21 rows get a
# colour; the rest keep NA - confirm this is intended.
tnode_name$V2 <- NA
tnode_name$V2[1:8] <- "#00DD00"
tnode_name$V2[9:15] <- "#FFAA33"
tnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- tnode_name$V2[match(V(g)$name, tnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout3 <- layout.fruchterman.reingold(g)
plot(g, layout = layout3, pt.cex = 1, cex = .8)

對booking做飯店分類,並對負評斷詞
# Tokenise the negative reviews (ReviewNeg) of the hotels.
tidybookbhtn <- unnest_tokens(bht, wordn, ReviewNeg, token = book_tokenizer)
# Id is the group index of HotelName, i.e. one id per property.
tidybookbhtn$Id <- group_indices(tidybookbhtn, HotelName)
tidybookbhtn <- select(tidybookbhtn, HotelName, wordn, Id)
str(tidybookbhtn)
## 'data.frame': 13111 obs. of 3 variables:
## $ HotelName: chr "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
## $ wordn : chr "這次" "住宿" "陽台" "每天" ...
## $ Id : int 24 24 24 24 24 24 24 24 24 24 ...
# Preview the first rows of the tokenised hotel negative-review table.
head(tidybookbhtn)
## HotelName wordn Id
## 1 雅霖大飯店 這次 24
## 2 雅霖大飯店 住宿 24
## 3 雅霖大飯店 陽台 24
## 4 雅霖大飯店 每天 24
## 5 雅霖大飯店 早餐 24
## 6 雅霖大飯店 套餐 24
計算評語之間的Co-occurrence
### Use the 30 most frequent review words as nodes
# NOTE(review): the word list is read from a hard-coded absolute path.
tnnode_name <- fread(file = "c:/learning/mid/wordn1.txt", encoding = 'UTF-8', header = FALSE)
# Keep only the node words, then count how often two of them share an Id.
term_cooccurrence_mhtn <- tidybookbhtn %>%
  filter(wordn %in% tnnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = FALSE)
term_cooccurrence_mhtn <- as.data.frame(term_cooccurrence_mhtn)
### Remove mirrored pairs
# (a, b) and (b, a) are one undirected edge. Each pair is put in a fixed
# order with pmin()/pmax() instead of sorting every mixed-type row in a loop;
# that approach relied on the count collating before both words and
# misbehaved on an empty table, where 1:nrow() iterates over c(1, 0).
term_cooccurrence_mhtn <- with(term_cooccurrence_mhtn, data.frame(
  item1 = pmin(as.character(item1), as.character(item2)),
  item2 = pmax(as.character(item1), as.character(item2)),
  weight = as.numeric(n),
  stringsAsFactors = FALSE
))
term_cooccurrence_mhtn <- unique(term_cooccurrence_mhtn)
### Draw the negative-review co-occurrence network
g <- term_cooccurrence_mhtn %>% graph_from_data_frame(directed = FALSE)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Colour by row position in tnnode_name (rows 1-8, 9-15, 16-21).
# NOTE(review): the header mentions 30 node words, but only 21 rows get a
# colour; the rest keep NA - confirm this is intended.
tnnode_name$V2 <- NA
tnnode_name$V2[1:8] <- "#00DD00"
tnnode_name$V2[9:15] <- "#FFAA33"
tnnode_name$V2[16:21] <- "#EEEE00"
# Vectorised lookup: always one colour per vertex, in vertex order.
V(g)$color <- tnnode_name$V2[match(V(g)$name, tnnode_name$V1)]
# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
layout4 <- layout.fruchterman.reingold(g)
plot(g, layout = layout4, pt.cex = 1, cex = .8)
