require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(tidytext)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(topicmodels)
library(readr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(widyr)
library(ggraph)
setwd("C:/learning/mid")
booking<- fread("booking_reviews.csv")
bhs<-(booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking$HotelName),])

bht<-(booking[grepl("店|館|中心|文旅", booking$HotelName),])

自訂user word及停用字並用結巴斷詞

jieba_tokenizer <- worker(stop_word ="stop_words.txt",user="user_words.txt")

book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

資料分群分析

對booking做民宿分類,並斷詞

tidybook = booking %>% unnest_tokens(word,Review,token= book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName))  %>%  select(HotelName,word,Id)

tidybookbhs = booking %>% 
  unnest_tokens(word,Review,token= book_tokenizer) %>%
  count(HotelName,word) %>% 
  filter(word!='好極了') %>% 
  filter(word!='傑出') %>% 
  filter(word!='很棒') %>% 
  #filter(str_detect(word,regex("![a-z]"),ignore_case = TRUE)) %>% 
  rename(count=n) %>% 
  mutate(Id = group_indices(., HotelName)) 

將資料轉換為Document Term Matrix (DTM)

token= tidybookbhs 
tidybook_dtm <- token %>% cast_dtm(Id, word, count)
tidybook_dtm
## <<DocumentTermMatrix (documents: 227, terms: 3461)>>
## Non-/sparse entries: 11894/773753
## Sparsity           : 98%
## Maximal term length: 16
## Weighting          : term frequency (tf)
#得到257*11508的document term matrix

資料分群分析

尋找Topic的代表字

tidybook_lda <- LDA(tidybook_dtm, k = 2, control = list(seed = 1234))
# 看各群的常用詞彙
tidy(tidybook_lda, matrix = "beta") %>%
    group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

#查看ϕϕ matrix (topic * term), 從book_topics中可以得到特定主題生成特定詞彙的概率。
book_topics <- tidy(tidybook_lda, matrix = "beta")
# 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
book_topics
## # A tibble: 6,922 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 80    0.000257
##  2     2 80    0.000117
##  3     1 nice  0.00483 
##  4     2 nice  0.00288 
##  5     1 小孩  0.000206
##  6     2 小孩  0.000751
##  7     1 不太  0.000257
##  8     2 不太  0.000117
##  9     1 不錯  0.00863 
## 10     2 不錯  0.00962 
## # ... with 6,912 more rows
#尋找Topic的代表字
book_top_terms <- book_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


book_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

book_top_terms
## # A tibble: 20 x 3
##    topic term    beta
##    <int> <chr>  <dbl>
##  1     1 令人  0.0257
##  2     1 民宿  0.0234
##  3     1 愉悅  0.0205
##  4     1 老闆  0.0189
##  5     1 舒適  0.0182
##  6     1 親切  0.0162
##  7     1 服務  0.0151
##  8     1 房間  0.0147
##  9     1 住宿  0.0138
## 10     1 乾淨  0.0134
## 11     2 令人  0.0257
## 12     2 民宿  0.0224
## 13     2 愉悅  0.0211
## 14     2 老闆  0.0166
## 15     2 房間  0.0160
## 16     2 尚可  0.0146
## 17     2 舒適  0.0134
## 18     2 乾淨  0.0129
## 19     2 服務  0.0126
## 20     2 住宿  0.0112

計算評語之間的Co-occurrence:

node_name= fread(file = "c:/learning/mid/word.txt", encoding='UTF-8',header=F)


term_cooccurrence_m=tidybookbhs %>%
  filter(word  %in% node_name$V1) %>%   
  pairwise_count(word, Id, sort = TRUE,diag=F)



term_cooccurrence_m=as.data.frame(term_cooccurrence_m)

移除重複的pairwise:

for (i in 1:nrow(term_cooccurrence_m)){
    term_cooccurrence_m[i, ] = sort(term_cooccurrence_m[i,])
}

term_cooccurrence_m=term_cooccurrence_m[!duplicated(term_cooccurrence_m),]
names(term_cooccurrence_m)=c('weight','item1','item2')
term_cooccurrence_m=term_cooccurrence_m %>%  select(item1,item2,weight)
term_cooccurrence_m$weight=as.numeric(term_cooccurrence_m$weight)
head(term_cooccurrence_m)
##    item1 item2 weight
## 1   令人  愉悅     90
## 3   乾淨  舒適     85
## 5   房間  舒適     84
## 7   房間  乾淨     82
## 9   民宿  老闆     80
## 11  住宿  舒適     80

畫出Co-occurrence網路圖

g=term_cooccurrence_m %>% graph_from_data_frame(directed = F) 
 # set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
node_name$V2=NA
node_name$V2[1:7]='#00DD00'
node_name$V2[8:15]='#FFAA33'
node_name$V2[16:23]='#EEEE00'
node_name$V2[24:30]='#ff00dd'

V(g)$color=sapply(names(V(g)), function(v){
  node_name$V2[node_name$V1==v]
})
set.seed(0525)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1, pt.cex=1, cex=.8)

對booking做民宿分類,並斷詞

tidybookbhsn = bhs %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>%  mutate(Id = group_indices(., HotelName))  %>%  select(HotelName,wordn,Id)

str(tidybookbhsn)
## 'data.frame':    10834 obs. of  3 variables:
##  $ HotelName: chr  "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
##  $ wordn    : chr  "枕頭" "支撐" "不足" "洗手台" ...
##  $ Id       : int  16 16 16 16 16 16 16 16 16 16 ...
head(tidybookbhsn)
##      HotelName  wordn Id
## 1 天空格子商旅   枕頭 16
## 2 天空格子商旅   支撐 16
## 3 天空格子商旅   不足 16
## 4 天空格子商旅 洗手台 16
## 5 天空格子商旅   浴室 16
## 6 天空格子商旅   澗水 16

nnode_name=fread(file = "c:/learning/mid/wordn.txt", encoding='UTF-8',header=F)

tidybookbhsn = bhs %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>%  mutate(Id = group_indices(., HotelName))  %>%  select(HotelName,wordn,Id)

###計算評語之間的Co-occurrence
term_cooccurrence_mn=tidybookbhsn %>%
  filter(wordn  %in% nnode_name$V1) %>%   
  pairwise_count(wordn, Id, sort = TRUE,diag=F)

term_cooccurrence_mn=as.data.frame(term_cooccurrence_mn)

###移除重複的pairwise
for (i in 1:nrow(term_cooccurrence_mn)){
    term_cooccurrence_mn[i, ] = sort(term_cooccurrence_mn[i,])
}

term_cooccurrence_mn=term_cooccurrence_mn[!duplicated(term_cooccurrence_mn),]
names(term_cooccurrence_mn)=c('weight','item1','item2')
term_cooccurrence_mn=term_cooccurrence_mn %>%  select(item1,item2,weight)
term_cooccurrence_mn$weight=as.numeric(term_cooccurrence_mn$weight)

###畫出負評Co-occurrence網路圖
g=term_cooccurrence_mn %>% graph_from_data_frame(directed = F) 
 # set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

node_name$V2=NA
nnode_name$V2[1:7]='#00DD00'
## Warning in `[<-.data.table`(x, j = name, value = value): Supplied 7 items
## to be assigned to 31 items of column 'V2' (recycled leaving remainder of 3
## items).
nnode_name$V2[8:15]='#FFAA33'
nnode_name$V2[16:23]='#EEEE00'
nnode_name$V2[24:30]='#ff00dd'


V(g)$color=sapply(names(V(g)), function(v){
  nnode_name$V2[nnode_name$V1==v]
})


set.seed(0525)
layout2 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1, pt.cex=1, cex=.8)

以co-occurrence作為權重

egam <-(log(E(g)$weight)) / max(log(E(g)$weight)) *6

E(g)$width <- egam

plot(g, layout=layout1)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

以Degree作為頂點大小

degree(g, mode="all")
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望 
##   30   30   30   30   30   30   30   30   30   30   30   30   30   30   30 
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要 
##   30   30   30   30   30   30   30   30   30   30   30   30   30   30   30 
## 覺得 
##   30
deg <- degree(g, mode="all")

plot(g, vertex.size=deg*1.2)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

以Closeness作為頂點大小

closeness(g, mode="all", weights=NA, normalized=T)
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 覺得 
##    1
deg <- closeness(g, mode="all" , weights=NA, normalized=T)

plot(g, vertex.size=deg*20)

legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

以betweenness作為頂點大小

betweenness(g, directed=F, weights=NA, normalized = T)
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
## 覺得 
##    0
deg <- betweenness(g, directed=F, weights=NA, normalized = T)

plot(g, vertex.size=deg*1000)

legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)