require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(tidytext)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(topicmodels)
library(readr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(widyr)
library(ggraph)
# Switch to the project directory; the relative file names below (and the
# dictionary files used by the tokenizer) are resolved against it.
setwd("C:/learning/mid")

# Read the two CSV inputs with data.table::fread().
bh <- fread("booking_hotels.csv")
booking <- fread("booking_reviews.csv")

# Keep the rows of `booking` whose HotelName matches any of the keywords
# (the surrounding notes treat these as B&B-type lodgings).
bhs <- booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking$HotelName), ]
自訂使用者詞典(user words)及停用字,並用結巴(jiebaR)斷詞
# jiebaR segmentation worker configured with a custom stop-word file and a
# custom user dictionary (both resolved relative to the working directory).
jieba_tokenizer <- worker(user = "user_words.txt", stop_word = "stop_words.txt")

# Tokenizer for unnest_tokens(): segments each element of `t` with the jieba
# worker and keeps only tokens longer than one character.
#
# t: vector (or list) of text strings.
# Returns a list with one character vector of tokens per element of `t`.
book_tokenizer <- function(t) {
  segment_one <- function(x) {
    pieces <- segment(x, jieba_tokenizer)
    pieces[nchar(pieces) > 1]
  }
  lapply(t, segment_one)
}
資料分群分析
對booking做民宿分類,並斷詞
# Tokenise the Review column of `booking` into one word per row, then attach
# an integer Id derived from HotelName and keep only the three needed columns.
# NOTE(review): group_indices(., col) is deprecated in dplyr >= 1.0 in favour
# of group_by() + cur_group_id(); left unchanged to preserve the Ids.
tidybook <- booking %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)

str(tidybook)
## 'data.frame': 22625 obs. of 3 variables:
## $ HotelName: chr "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
## $ word : chr "服務" "人員" "態度" "傑出" ...
## $ Id : int 172 172 172 172 172 172 172 172 172 172 ...
head(tidybook)
# Same tokenisation for the keyword-filtered subset `bhs`. Review is coerced
# to character before being handed to the tokenizer.
bhs$Review <- as.character(bhs$Review)

tidybookbhs <- bhs %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)

str(tidybookbhs)
## 'data.frame': 12582 obs. of 3 variables:
## $ HotelName: chr "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
## $ word : chr "hen" "棒棒" "乾淨" "新穎" ...
## $ Id : int 18 18 18 18 18 18 18 18 18 18 ...
head(tidybookbhs)
計算評語之間的Co-occurrence:先讀入作為網路節點的詞彙清單(word.txt)
# Read the headerless UTF-8 word list; its first column (V1) is later used to
# select the words that become graph nodes.
node_name <- fread(
  file = "c:/learning/mid/word.txt",
  encoding = "UTF-8",
  header = FALSE
)
將民宿分三群
# Split the subset `bhs` into three groups by Rate:
#   bhs1: Rate equal to 10
#   bhs2: Rate in (8.8, 10)
#   bhs3: Rate <= 8.8
# bhs1 previously used grepl("10", Rate), a substring match on the text form
# of the score; a numeric comparison states the intent and is consistent with
# the two filters below. which() drops NA rates, as grepl() did.
bhs1 <- bhs[which(bhs$Rate == 10), ]
bhs2 <- filter(bhs, Rate > 8.8 & Rate < 10)
bhs3 <- filter(bhs, Rate <= 8.8)
評價10的民宿
# Tokenise the reviews of the Rate-10 group into one word per row and attach
# an integer Id derived from HotelName.
bhs1$Review <- as.character(bhs1$Review)

tidybookbhs1 <- bhs1 %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)

str(tidybookbhs1)
## 'data.frame': 6478 obs. of 3 variables:
## $ HotelName: chr "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
## $ word : chr "hen" "棒棒" "傑出" "傑出" ...
## $ Id : int 17 17 17 17 17 17 17 17 17 17 ...
head(tidybookbhs1)
計算評語之間的Co-occurrence:
# Restrict the tokens to the node words (node_name$V1) and count, for each
# pair of words, the number of Ids in which both occur. The result is sorted
# by count and converted to a plain data.frame.
term_cooccurrence_m1 <- tidybookbhs1 %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE) %>%
  as.data.frame()
移除重複的pairwise
# Collapse the symmetric pairs (a, b) / (b, a) produced by pairwise_count()
# into a single undirected edge and return columns item1, item2, weight.
#
# The earlier row-by-row sort() of a one-row data.frame relied on the numeric
# count collating before the two words, and failed on an empty table because
# 1:nrow() yields c(1, 0). Ordering each pair with pmin()/pmax() is vectorised
# and does not depend on how the count compares to the words.
term_cooccurrence_m1 <- with(
  term_cooccurrence_m1,
  data.frame(
    item1 = pmin(as.character(item1), as.character(item2)),
    item2 = pmax(as.character(item1), as.character(item2)),
    weight = as.numeric(n),
    stringsAsFactors = FALSE
  )
)

# Keep the first occurrence of every (item1, item2, weight) row.
term_cooccurrence_m1 <- term_cooccurrence_m1[!duplicated(term_cooccurrence_m1), ]
畫出Co-occurrence網路圖
# Build an undirected igraph graph from the edge list (item1, item2, weight).
g <- graph_from_data_frame(term_cooccurrence_m1, directed = FALSE)

# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

# Assign one colour per block of rows in the word list: rows 1-7, 8-15, 16-23
# and 24-30. Rows beyond 30 get NA. Indexing by seq_len(nrow()) also keeps the
# assignment valid when the list has fewer than 30 rows.
node_name$V2 <- rep(
  c("#00DD00", "#FFAA33", "#EEEE00", "#ff00dd"),
  times = c(7, 8, 8, 7)
)[seq_len(nrow(node_name))]

# Look up each vertex's colour by name. match() always yields exactly one
# value per vertex (NA when the word is absent), unlike the sapply() lookup,
# which returns a list when a word is missing or duplicated.
V(g)$color <- node_name$V2[match(V(g)$name, node_name$V1)]

# Fixed seed so the force-directed layout is reproducible.
set.seed(0525)
# layout_with_fr() is the current name of layout.fruchterman.reingold().
layout11 <- layout_with_fr(g)
# NOTE(review): pt.cex / cex look like legend() arguments; confirm that
# plot.igraph() actually uses them.
plot(g, layout = layout11, pt.cex = 1, cex = .8)

以Degree作為頂點大小
# Degree of every vertex: computed once, printed, then used to scale the
# vertex size in the plot.
deg <- degree(g, mode = "all")
deg
## 民宿 老闆 傑出 乾淨 下次 房間 住宿 推薦 很棒 舒適
## 27 27 27 26 27 26 26 26 27 26
## 值得 服務 老闆娘 澎湖 親切 入住 行程 地點 舒服 早餐
## 25 27 26 26 26 27 26 25 26 25
## 貼心 方便 熱心 感覺 不錯 令人 好極了 環境
## 27 26 26 26 25 21 10 27
plot(g, vertex.size = deg * 1.2)
# NOTE(review): the legend lists three colours while four are assigned to the
# vertices; confirm the intended labels.
legend(
  "bottomright",
  legend = c("hs1", "hs2", "hs3"),
  pch = 21,
  col = "#777777",
  pt.bg = c("#FFAA33", "#00DD00", "#EEEE00"),
  pt.cex = 1,
  cex = .8
)

以Closeness作為頂點大小
# Normalised, unweighted closeness of every vertex: computed once, printed,
# then used to scale the vertex size. The result is stored in `deg`, as in
# the other centrality sections.
deg <- closeness(g, mode = "all", weights = NA, normalized = TRUE)
deg
## 民宿 老闆 傑出 乾淨 下次 房間 住宿
## 1.0000000 1.0000000 1.0000000 0.9642857 1.0000000 0.9642857 0.9642857
## 推薦 很棒 舒適 值得 服務 老闆娘 澎湖
## 0.9642857 1.0000000 0.9642857 0.9310345 1.0000000 0.9642857 0.9642857
## 親切 入住 行程 地點 舒服 早餐 貼心
## 0.9642857 1.0000000 0.9642857 0.9310345 0.9642857 0.9310345 1.0000000
## 方便 熱心 感覺 不錯 令人 好極了 環境
## 0.9642857 0.9642857 0.9642857 0.9310345 0.8181818 0.6136364 1.0000000
plot(g, vertex.size = deg * 20)
legend(
  "bottomright",
  legend = c("hs1", "hs2", "hs3"),
  pch = 21,
  col = "#777777",
  pt.bg = c("#FFAA33", "#00DD00", "#EEEE00"),
  pt.cex = 1,
  cex = .8
)

以betweenness作為頂點大小
# Normalised, unweighted, undirected betweenness of every vertex: computed
# once, printed, then used to scale the vertex size. The result is stored in
# `deg`, as in the other centrality sections.
deg <- betweenness(g, directed = FALSE, weights = NA, normalized = TRUE)
deg
## 民宿 老闆 傑出 乾淨 下次 房間
## 0.005553294 0.005553294 0.005553294 0.000678334 0.005553294 0.000678334
## 住宿 推薦 很棒 舒適 值得 服務
## 0.000678334 0.004558405 0.005553294 0.000678334 0.000000000 0.005553294
## 老闆娘 澎湖 親切 入住 行程 地點
## 0.000678334 0.000678334 0.000678334 0.005553294 0.000678334 0.000000000
## 舒服 早餐 貼心 方便 熱心 感覺
## 0.000678334 0.000000000 0.005553294 0.000678334 0.000678334 0.000678334
## 不錯 令人 好極了 環境
## 0.000000000 0.000000000 0.000000000 0.005553294
plot(g, vertex.size = deg * 1000)
legend(
  "bottomright",
  legend = c("hs1", "hs2", "hs3"),
  pch = 21,
  col = "#777777",
  pt.bg = c("#FFAA33", "#00DD00", "#EEEE00"),
  pt.cex = 1,
  cex = .8
)
