require(ggplot2)
require(dplyr)
require(data.table)
require(scales)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
library(tidyr)
library(readr)
library(reshape2)
library(igraph)
library(topicmodels)
library(tm)
library(widyr)
library(ggraph)
setwd("C:/learning/mid")
booking<- fread("booking_reviews.csv")
bhs<-(booking[grepl("宿|村|子|屋|墅|巷|舍|園|棧|house|home", booking$HotelName),])
bht<-(booking[grepl("店|館|中心|文旅", booking$HotelName),])
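It can be worth sanity-checking the keyword split before going further. The sketch below only assumes the HotelName column that booking_reviews.csv already provides; the overlap check is purely illustrative.
# Illustrative sanity check: group sizes, and whether any name matches both keyword sets
nrow(bhs)
nrow(bht)
sum(unique(bhs$HotelName) %in% unique(bht$HotelName))  # names caught by both rules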
Define custom user words and stop words, then tokenize with Jieba
jieba_tokenizer <- worker(stop_word = "stop_words.txt", user = "user_words.txt")
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens) > 1]  # drop single-character tokens
    return(tokens)
  })
}
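A quick way to see what the tokenizer returns is to feed it a made-up review sentence (the example string below is not from the dataset):
# Hypothetical input; the real text comes from the Review / ReviewNeg columns
book_tokenizer("老闆很親切,房間乾淨又舒適")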
Cluster analysis of the data
Classify the booking data into homestays and tokenize the reviews
tidybook = booking %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, word, Id)
tidybookbhs = booking %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  count(HotelName, word) %>%
  # drop generic rating words that appear in almost every review
  filter(word != '好極了') %>%
  filter(word != '傑出') %>%
  filter(word != '很棒') %>%
  # filter(str_detect(word, regex("![a-z]", ignore_case = TRUE))) %>%
  rename(count = n) %>%
  mutate(Id = group_indices(., HotelName))
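Before casting to a matrix, a quick peek at the per-property word counts helps confirm the filtering did what was intended; a minimal sketch:
# Most frequent (HotelName, word) pairs after removing the generic rating words
tidybookbhs %>% arrange(desc(count)) %>% head(10)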
Convert the data into a document-term matrix (DTM)
token= tidybookbhs
tidybook_dtm <- token %>% cast_dtm(Id, word, count)
tidybook_dtm
## <<DocumentTermMatrix (documents: 227, terms: 3461)>>
## Non-/sparse entries: 11894/773753
## Sparsity : 98%
## Maximal term length: 16
## Weighting : term frequency (tf)
# this yields a 227 x 3461 document-term matrix
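As an optional check before fitting LDA, tm's findFreqTerms() lists the terms above a frequency threshold; the cutoff of 50 below is arbitrary.
# Terms appearing at least 50 times across all properties (threshold chosen arbitrarily)
findFreqTerms(tidybook_dtm, lowfreq = 50)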
Cluster analysis of the data
Find the representative terms of each topic
tidybook_lda <- LDA(tidybook_dtm, k = 2, control = list(seed = 1234))
# Inspect the most common terms in each topic
tidy(tidybook_lda, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# Inspect the phi matrix (topic x term): book_topics gives the probability that a given topic generates a given term.
book_topics <- tidy(tidybook_lda, matrix = "beta")
# Note: use matrix = "beta" inside tidy() to extract the phi matrix.
book_topics
## # A tibble: 6,922 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 80 0.000257
## 2 2 80 0.000117
## 3 1 nice 0.00483
## 4 2 nice 0.00288
## 5 1 小孩 0.000206
## 6 2 小孩 0.000751
## 7 1 不太 0.000257
## 8 2 不太 0.000117
## 9 1 不錯 0.00863
## 10 2 不錯 0.00962
## # ... with 6,912 more rows
# Find the representative terms of each topic
book_top_terms <- book_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
book_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

book_top_terms
## # A tibble: 20 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 令人 0.0257
## 2 1 民宿 0.0234
## 3 1 愉悅 0.0205
## 4 1 老闆 0.0189
## 5 1 舒適 0.0182
## 6 1 親切 0.0162
## 7 1 服務 0.0151
## 8 1 房間 0.0147
## 9 1 住宿 0.0138
## 10 1 乾淨 0.0134
## 11 2 令人 0.0257
## 12 2 民宿 0.0224
## 13 2 愉悅 0.0211
## 14 2 老闆 0.0166
## 15 2 房間 0.0160
## 16 2 尚可 0.0146
## 17 2 舒適 0.0134
## 18 2 乾淨 0.0129
## 19 2 服務 0.0126
## 20 2 住宿 0.0112
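The two topics share most of their top terms, so it is worth checking how each property is split between them. The per-document topic proportions (the gamma matrix) can be extracted the same way as beta; a minimal sketch:
# Gamma: for each document (property Id), the estimated proportion of each topic
book_gamma <- tidy(tidybook_lda, matrix = "gamma")
book_gamma %>% arrange(document, topic) %>% head(10)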
Compute term co-occurrence across the reviews:
node_name = fread(file = "c:/learning/mid/word.txt", encoding = 'UTF-8', header = F)
term_cooccurrence_m = tidybookbhs %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = F)
term_cooccurrence_m = as.data.frame(term_cooccurrence_m)
Remove duplicated pairs:
# Sorting each row makes (A, B) and (B, A) identical so duplicated() can drop the mirrored pairs;
# sort() coerces the row to character, so the weight is converted back to numeric afterwards.
for (i in 1:nrow(term_cooccurrence_m)) {
  term_cooccurrence_m[i, ] = sort(term_cooccurrence_m[i, ])
}
term_cooccurrence_m = term_cooccurrence_m[!duplicated(term_cooccurrence_m), ]
names(term_cooccurrence_m) = c('weight', 'item1', 'item2')
term_cooccurrence_m = term_cooccurrence_m %>% select(item1, item2, weight)
term_cooccurrence_m$weight = as.numeric(term_cooccurrence_m$weight)
head(term_cooccurrence_m)
## item1 item2 weight
## 1 令人 愉悅 90
## 3 乾淨 舒適 85
## 5 房間 舒適 84
## 7 房間 乾淨 82
## 9 民宿 老闆 80
## 11 住宿 舒適 80
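As an aside, widyr's pairwise_count() can also drop the mirrored pairs directly with upper = FALSE, which would avoid the row-sorting loop above; a minimal sketch under that assumption:
# Keep only one direction of each word pair, so no manual de-duplication is needed
term_cooccurrence_alt = tidybookbhs %>%
  filter(word %in% node_name$V1) %>%
  pairwise_count(word, Id, sort = TRUE, diag = FALSE, upper = FALSE) %>%
  rename(weight = n)
head(term_cooccurrence_alt)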
Plot the co-occurrence network
g = term_cooccurrence_m %>% graph_from_data_frame(directed = F)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# assign a colour to each word group listed in word.txt
node_name$V2 = NA
node_name$V2[1:7] = '#00DD00'
node_name$V2[8:15] = '#FFAA33'
node_name$V2[16:23] = '#EEEE00'
node_name$V2[24:30] = '#ff00dd'
V(g)$color = sapply(names(V(g)), function(v) {
  node_name$V2[node_name$V1 == v]
})
set.seed(0525)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout = layout1, pt.cex = 1, cex = .8)

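If the network is too dense to read, one option is to keep only the heavier edges before plotting; the cutoff of 60 co-occurrences below is arbitrary.
# Drop light edges (arbitrary cutoff) and re-plot the trimmed graph
g_trimmed = delete_edges(g, E(g)[E(g)$weight < 60])
plot(g_trimmed, layout = layout.fruchterman.reingold(g_trimmed))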
Classify the booking data into homestays and tokenize the negative reviews (ReviewNeg)
tidybookbhsn = bhs %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  mutate(Id = group_indices(., HotelName)) %>%
  select(HotelName, wordn, Id)
str(tidybookbhsn)
## 'data.frame': 10834 obs. of 3 variables:
## $ HotelName: chr "天空格子商旅" "天空格子商旅" "天空格子商旅" "天空格子商旅" ...
## $ wordn : chr "枕頭" "支撐" "不足" "洗手台" ...
## $ Id : int 16 16 16 16 16 16 16 16 16 16 ...
head(tidybookbhsn)
## HotelName wordn Id
## 1 天空格子商旅 枕頭 16
## 2 天空格子商旅 支撐 16
## 3 天空格子商旅 不足 16
## 4 天空格子商旅 洗手台 16
## 5 天空格子商旅 浴室 16
## 6 天空格子商旅 澗水 16
nnode_name=fread(file = "c:/learning/mid/wordn.txt", encoding='UTF-8',header=F)
### Compute term co-occurrence across the negative reviews
term_cooccurrence_mn = tidybookbhsn %>%
  filter(wordn %in% nnode_name$V1) %>%
  pairwise_count(wordn, Id, sort = TRUE, diag = F)
term_cooccurrence_mn = as.data.frame(term_cooccurrence_mn)
### Remove duplicated pairs
for (i in 1:nrow(term_cooccurrence_mn)) {
  term_cooccurrence_mn[i, ] = sort(term_cooccurrence_mn[i, ])
}
term_cooccurrence_mn = term_cooccurrence_mn[!duplicated(term_cooccurrence_mn), ]
names(term_cooccurrence_mn) = c('weight', 'item1', 'item2')
term_cooccurrence_mn = term_cooccurrence_mn %>% select(item1, item2, weight)
term_cooccurrence_mn$weight = as.numeric(term_cooccurrence_mn$weight)
### Plot the negative-review co-occurrence network
g = term_cooccurrence_mn %>% graph_from_data_frame(directed = F)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# assign a colour to each word group listed in wordn.txt
nnode_name$V2 = NA
nnode_name$V2[1:7] = '#00DD00'
nnode_name$V2[8:15] = '#FFAA33'
nnode_name$V2[16:23] = '#EEEE00'
nnode_name$V2[24:30] = '#ff00dd'
V(g)$color = sapply(names(V(g)), function(v) {
  nnode_name$V2[nnode_name$V1 == v]
})
set.seed(0525)
layout2 <- layout.fruchterman.reingold(g)
plot(g, layout = layout2, pt.cex = 1, cex = .8)

Use the co-occurrence counts as edge weights
# Scale edge width by the log of its co-occurrence count (widest edge = 6)
egam <- (log(E(g)$weight)) / max(log(E(g)$weight)) * 6
E(g)$width <- egam
plot(g, layout = layout2)
legend("bottomright", c('hs1', 'hs2', 'hs3', 'hs4'), pch = 21,
       col = "#777777", pt.bg = c("#FFAA33", "#00DD00", "#EEEE00", "#ff00dd"), pt.cex = 1, cex = .8)

Use degree as the vertex size
degree(g, mode="all")
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望
## 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要
## 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
## 覺得
## 30
deg <- degree(g, mode="all")
plot(g, vertex.size=deg*1.2)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

Use closeness as the vertex size
closeness(g, mode="all", weights=NA, normalized=T)
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 覺得
## 1
deg <- closeness(g, mode="all" , weights=NA, normalized=T)
plot(g, vertex.size=deg*20)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

Use betweenness as the vertex size
betweenness(g, directed=F, weights=NA, normalized = T)
## 有點 民宿 早餐 房間 方便 比較 市區 老闆 不好 住宿 位置 一點 浴室 地方 希望
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 入住 建議 地點 更好 冷氣 房客 提供 附近 服務 隔音 廁所 空間 感覺 距離 需要
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 覺得
## 0
deg <- betweenness(g, directed=F, weights=NA, normalized = T)
plot(g, vertex.size=deg*1000)
legend("bottomright", c('hs1','hs2','hs3','hs4'), pch=21,
col="#777777", pt.bg=c("#FFAA33","#00DD00","#EEEE00"), pt.cex=1, cex=.8)

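Note that every vertex has degree 30, closeness 1 and betweenness 0: among these 31 selected words, every pair co-occurs in at least one property's negative reviews, so the graph is complete and none of the three centrality measures can differentiate the vertices here. A quick check:
# A complete graph has density 1, which explains the uniform centrality scores
edge_density(g, loops = FALSE)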