四.解釋分析的結果(視覺化和結論)
載入套件
setwd("C:/learning/mid")
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
載入檔案
ah <- fread("agoda_hotels.csv")
agoda<- fread("agoda_reviews.csv")
bh <- fread("booking_hotels.csv")
booking<- fread("booking_reviews.csv")
初步整理資料檔,ah和bh皆只剩下一個Name欄位
ah<-ah[,-c(1,2)]
bh<-bh[,-1]
分析agoda飯店民宿比率(電腦做的不準確,若要百分百準確需以人工來處理)
agoda民宿65家,飯店26家,booking民宿253家,飯店58家
a_homestay_count<-ah$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
a_hotel_count<-ah$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()
b_homestay_count<-bh$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
b_hotel_count<-bh$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()
合併agoda和booking,並經由人工檢查是否有想同飯店使用不同名稱
結果全部業者共366家,民宿284家,飯店64家
th<-rbind(ah,bh)
th<-th %>% distinct() %>% arrange(desc(Name))
t_homestay_count<-th$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
t_hotel_count<-th$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()
將booking及agoda合併
booking <- select(booking, HotelName, Country, Rate, Review, ReviewNeg)%>%
mutate(web = "bk")
agoda<- select(agoda, HotelName, Country, Rate, Review, ReviewNeg)%>%
mutate(web = "ag")
chr<- rbind(booking,agoda)
計算agoda/booking/合併,各家業者之評分平均值,並排序
可以看出民宿的評比分數皆比較好,優於飯店的評比
a_rank<-aggregate(Rate~HotelName,agoda,mean) %>% arrange(-Rate)
b_rank<-aggregate(Rate~HotelName,booking,mean) %>% arrange(-Rate)
T_rank<-aggregate(Rate~HotelName,chr,mean) %>% arrange(-Rate)
自訂user word及停用字並用結巴斷詞
jieba_tokenizer <- worker(stop_word ="stop_words.txt",user="user_words.txt")
book_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
對booking的總評語做token化
tidybook = booking %>% unnest_tokens(word,Review,token= book_tokenizer) %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>% select(HotelName,Country,Rate,word,wordn)
head(tidybook)
## HotelName Country Rate word wordn
## 1 雅霖大飯店 臺灣 10 服務 這次
## 1.3 雅霖大飯店 臺灣 10 服務 住宿
## 1.4 雅霖大飯店 臺灣 10 服務 陽台
## 1.1 雅霖大飯店 臺灣 10 人員 這次
## 1.1.1 雅霖大飯店 臺灣 10 人員 住宿
## 1.1.2 雅霖大飯店 臺灣 10 人員 陽台
tidybook1 = booking %>% unnest_tokens(word,Review,token= book_tokenizer) %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>% select(HotelName,Country,Rate,word,wordn)
head(tidybook1)
## HotelName Country Rate word wordn
## 1 雅霖大飯店 臺灣 10 服務 這次
## 1.3 雅霖大飯店 臺灣 10 服務 住宿
## 1.4 雅霖大飯店 臺灣 10 服務 陽台
## 1.1 雅霖大飯店 臺灣 10 人員 這次
## 1.1.1 雅霖大飯店 臺灣 10 人員 住宿
## 1.1.2 雅霖大飯店 臺灣 10 人員 陽台
製作總評語長條圖
tidybook %>%
count(word, sort = TRUE) %>%
filter(n > 500) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
ylab("出現次數") +
coord_flip()

製作總評語文字雲
tokens_count <- tidybook %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>50) %>%
arrange(desc(sum))
head(tokens_count)
## # A tibble: 6 x 2
## word sum
## <chr> <int>
## 1 傑出 2433
## 2 令人 1893
## 3 很棒 1724
## 4 民宿 1262
## 5 愉悅 1149
## 6 好極了 1113
tokens_count %>% wordcloud2()
製作負評語長條圖
tidybook1 %>%
count(wordn, sort = TRUE) %>%
filter(n > 300) %>%
mutate(wordn = reorder(wordn, n)) %>%
ggplot(aes(wordn, n)) +
geom_col() +
xlab(NULL) +
ylab("出現次數") +
coord_flip()

增加情緒字典
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")
liwc_p1 <- read_file("C:/learning/hw2/dict/liwc/positive1.txt")
liwc_n1 <- read_file("C:/learning/hw2/dict/liwc/negative1.txt")
#切出LIWC情緒的正負評價,製作對照表
positive <- strsplit(liwc_p, "[,]")[[1]]
negative <- strsplit(liwc_n, "[,]")[[1]]
positive1 <- strsplit(liwc_p1, "[,]")[[1]]
negative1 <- strsplit(liwc_n1, "[,]")[[1]]
positive <- data.frame(word = positive, sentiment = "positive",stringsAsFactors = F)
negative <- data.frame(word = negative, sentiment = "negative",stringsAsFactors = F)
positive1 <- data.frame(wordn = positive1, sentiment = "positive",stringsAsFactors = F)
negative1 <- data.frame(wordn = negative1, sentiment = "negative",stringsAsFactors = F)
LIWC_ch = rbind(positive,negative)
head(LIWC_ch)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
LIWC_ch1 = rbind(positive1,negative1)
head(LIWC_ch1)
## wordn sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
總評語正負情緒對照文字雲
tokens_count %>%
inner_join(LIWC_ch) %>%
select(word,sentiment,sum) %>%
acast(word ~ sentiment,value.var = "sum", fill = 0) %>%
wordcloud::comparison.cloud(random.order=FALSE,colors = c("gray80", "indianred3"),max.words = 108)
## Joining, by = "word"

負評語正負情緒對照文字雲
tokens_count1 <- tidybook1 %>%
filter(nchar(.$wordn)>1) %>%
group_by(wordn) %>%
summarise(sum = n()) %>%
filter(sum>50) %>%
arrange(desc(sum))
head(tokens_count1)
## # A tibble: 6 x 2
## wordn sum
## <chr> <int>
## 1 the 1611
## 2 房間 1534
## 3 早餐 1002
## 4 有點 938
## 5 to 696
## 6 提供 677
tokens_count1 %>%
inner_join(LIWC_ch1) %>%
select(wordn,sentiment,sum) %>%
acast(wordn ~ sentiment,value.var = "sum", fill = 0) %>%
wordcloud::comparison.cloud(random.order=FALSE,colors = c("gray80", "indianred3"),max.words = 108)
## Joining, by = "wordn"

總評語各飯店情緒差值
bt_sentiment <-tidybook %>%
inner_join(LIWC_ch) %>%
count(HotelName = HotelName, sentiment)%>%
spread(sentiment, n, fill = 0) %>%
mutate(sentimentx = positive - negative)
## Joining, by = "word"
head(bt_sentiment)
## # A tibble: 6 x 4
## HotelName negative positive sentimentx
## <chr> <dbl> <dbl> <dbl>
## 1 ?樂民宿 18 215 197
## 2 180度沙灘最前線民宿 0 5 5
## 3 198 紅帽民宿 0 37 37
## 4 2巷9號海景民宿 0 4 4
## 5 35民宿 0 9 9
## 6 525民宿 0 134 134
總評語各飯店情緒差值圖形化
ggplot(bt_sentiment, aes(HotelName, sentimentx,fill = HotelName)) +
geom_col(show.legend = FALSE,width = 0.8) +
xlab("飯店")+
ylab("情緒差值")

負評語各飯店情緒差值圖形化
bn_sentiment <-tidybook1 %>%
inner_join(LIWC_ch1) %>%
count(HotelName = HotelName, sentiment)%>%
spread(sentiment, n, fill = 0) %>%
mutate(sentimentx = positive - negative)
## Joining, by = "wordn"
head(bn_sentiment)
## # A tibble: 6 x 4
## HotelName negative positive sentimentx
## <chr> <dbl> <dbl> <dbl>
## 1 ?樂民宿 136 37 -99
## 2 198 紅帽民宿 2 3 1
## 3 2巷9號海景民宿 3 3 0
## 4 35民宿 7 7 0
## 5 525民宿 0 25 25
## 6 7號旅店背包客棧 11 3 -8
負評語各飯店情緒差值圖形化
ggplot(bn_sentiment, aes(HotelName, sentimentx,fill = HotelName)) +
geom_col(show.legend = FALSE,width = 0.8) +
xlab("飯店")+
ylab("情緒差值")

總評語各情緒詞出現次數並遞減排序
word_count <- tidybook %>%
inner_join(LIWC_ch) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
word_count
## # A tibble: 145 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 傑出 positive 2433
## 2 很棒 positive 1724
## 3 愉悅 positive 1149
## 4 不錯 positive 975
## 5 失望 negative 810
## 6 舒適 positive 785
## 7 親切 positive 556
## 8 乾淨 positive 542
## 9 問題 negative 479
## 10 值得 positive 390
## # ... with 135 more rows
分別呈現總評語正負情緒詞出現次數前10名
word_count %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n

負評語各情緒詞出現次數並遞減排序
word_count1 <- tidybook1 %>%
inner_join(LIWC_ch1) %>%
count(wordn, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "wordn"
word_count1
## # A tibble: 219 x 3
## wordn sentiment n
## <chr> <chr> <int>
## 1 不好 negative 302
## 2 希望 positive 271
## 3 清楚 positive 191
## 4 問題 negative 173
## 5 乾淨 positive 172
## 6 改善 positive 148
## 7 容易 positive 139
## 8 不便 negative 137
## 9 喜歡 positive 132
## 10 不足 negative 122
## # ... with 209 more rows
負評語各正負情緒詞出現次數前10名
word_count1 %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(wordn = reorder(wordn, n)) %>%
ggplot(aes(wordn, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n

總評語-增加各斷詞出現的總數
com_word <- tidybook %>%
count(HotelName, word, sort = TRUE)
total_words <- com_word %>%
group_by(HotelName) %>%
summarize(total = sum(n))
com_word <- left_join(com_word, total_words)
## Joining, by = "HotelName"
com_word
## # A tibble: 5,890 x 4
## HotelName word n total
## <chr> <chr> <int> <int>
## 1 平湖窩行旅 規劃 250 7944
## 2 澎湖安一海景大飯店 hotel 224 6891
## 3 平湖窩行旅 不錯 212 7944
## 4 平湖窩行旅 質感 196 7944
## 5 百勝民宿 was 195 1756
## 6 豐谷大飯店 ok 188 1663
## 7 豐谷大飯店 卡拉 188 1663
## 8 日立大飯店 令人 187 6696
## 9 平湖窩行旅 整體 183 7944
## 10 寶華大飯店 hotel 182 4451
## # ... with 5,880 more rows
總評語-增加rank及詞性頻率欄位
freq_by_rank <- com_word %>%
group_by(HotelName) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
## # A tibble: 5,890 x 6
## # Groups: HotelName [199]
## HotelName word n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 平湖窩行旅 規劃 250 7944 1 0.0315
## 2 澎湖安一海景大飯店 hotel 224 6891 1 0.0325
## 3 平湖窩行旅 不錯 212 7944 2 0.0267
## 4 平湖窩行旅 質感 196 7944 3 0.0247
## 5 百勝民宿 was 195 1756 1 0.111
## 6 豐谷大飯店 ok 188 1663 1 0.113
## 7 豐谷大飯店 卡拉 188 1663 2 0.113
## 8 日立大飯店 令人 187 6696 1 0.0279
## 9 平湖窩行旅 整體 183 7944 4 0.0230
## 10 寶華大飯店 hotel 182 4451 1 0.0409
## # ... with 5,880 more rows
畫出rank及詞性頻率的關係線圖
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color =HotelName)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

examine Zipf’s law
rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.07228 -1.32710
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = HotelName)) +
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

總評語bind_tf_idf
雅霖大飯店/澎澄飯店/豐谷大飯店/平湖窩行旅
bt_words1 <- com_word %>%
bind_tf_idf(word, HotelName, n)
bt_words3<- (bt_words1[grepl("雅霖大飯店",bt_words1$HotelName),])
bt_words2<- (bt_words1[grepl("澎澄飯店" ,bt_words1$HotelName),])
bt_words4<- (bt_words1[grepl("豐谷大飯店" ,bt_words1$HotelName),])
bt_words5<- (bt_words1[grepl("平湖窩行旅" ,bt_words1$HotelName),])
bt_words<-rbind(bt_words2,bt_words3,bt_words5,bt_words4)
bt_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 421 x 6
## HotelName word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 豐谷大飯店 卡拉 188 0.113 5.29 0.598
## 2 豐谷大飯店 ok 188 0.113 3.50 0.396
## 3 豐谷大飯店 對面 94 0.0565 5.29 0.299
## 4 豐谷大飯店 整晚 94 0.0565 4.60 0.260
## 5 豐谷大飯店 太吵 94 0.0565 4.19 0.237
## 6 雅霖大飯店 low 71 0.0437 5.29 0.231
## 7 豐谷大飯店 噪音 94 0.0565 3.68 0.208
## 8 雅霖大飯店 services 71 0.0437 4.60 0.201
## 9 豐谷大飯店 解決 94 0.0565 3.50 0.198
## 10 豐谷大飯店 實惠 94 0.0565 3.50 0.198
## # ... with 411 more rows
總評語
這四家飯店前15個常出現的字詞畫視覺圖
bt_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(HotelName) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = HotelName)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~HotelName, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf

找出四家飯店的bigram
bt_bigrams <- booking %>%
unnest_tokens(bigram, Review, token = "ngrams", n = 2)
bt_bigrams1 <- (bt_bigrams[grepl("雅霖大飯店",bt_bigrams$HotelName),])
bt_bigrams2 <- (bt_bigrams[grepl("澎澄飯店",bt_bigrams$HotelName),])
bt_bigrams3 <- (bt_bigrams[grepl("豐谷大飯店",bt_bigrams$HotelName),])
bt_bigrams4 <- (bt_bigrams[grepl("平湖窩行旅",bt_bigrams$HotelName),])
bt_bigrams<-rbind(bt_bigrams1,bt_bigrams2, bt_bigrams3, bt_bigrams4)
bt_bigrams %>% na.omit(bt_bigrams) %>%
count(bigram, sort = TRUE)
## # A tibble: 1,141 x 2
## bigram n
## <chr> <int>
## 1 傑出 傑出 208
## 2 好 好 71
## 3 非常 好 56
## 4 很 棒 56
## 5 好極了 好極了 42
## 6 令人 愉悅 29
## 7 好 非常 27
## 8 棒 很 25
## 9 入 住 10
## 10 愉悅 令人 10
## # ... with 1,131 more rows
依四家飯店資料整理bigram
bt_bigrams_separated <- bt_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bt_bigrams_filtered <- bt_bigrams_separated %>% na.omit(bt_bigram_counts) %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bt_bigram_counts <- bt_bigrams_filtered %>%
count(word1, word2, sort = TRUE) %>% na.omit(bt_bigram_counts)
bt_bigram_united <- bt_bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ") %>% na.omit(bt_bigram_united) %>%
select(HotelName,bigram)
bt_bigram_counts
## # A tibble: 1,048 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 傑出 傑出 208
## 2 好 好 71
## 3 非常 好 56
## 4 很 棒 56
## 5 好極了 好極了 42
## 6 令人 愉悅 29
## 7 好 非常 27
## 8 棒 很 25
## 9 入 住 10
## 10 愉悅 令人 10
## # ... with 1,038 more rows
常出現的前6種birgam組合
bt_bigram_tf_idf <- bt_bigram_united %>%
count(HotelName, bigram) %>%
bind_tf_idf(bigram, HotelName, n) %>%
arrange(desc(tf_idf))
bt_bigram_tf_idf
## # A tibble: 1,114 x 6
## HotelName bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 豐谷大飯店 到 餐廳 2 0.0116 1.39 0.0161
## 2 豐谷大飯店 空 的 2 0.0116 1.39 0.0161
## 3 雅霖大飯店 去 澎湖 3 0.0102 1.39 0.0142
## 4 雅霖大飯店 雅 霖 3 0.0102 1.39 0.0142
## 5 雅霖大飯店 的 房間 2 0.00683 1.39 0.00946
## 6 雅霖大飯店 的 飯店 2 0.00683 1.39 0.00946
## 7 雅霖大飯店 霖 飯店 2 0.00683 1.39 0.00946
## 8 平湖窩行旅 好極了 好極了 33 0.0319 0.288 0.00918
## 9 豐谷大飯店 15 到 1 0.00581 1.39 0.00806
## 10 豐谷大飯店 2700 的 1 0.00581 1.39 0.00806
## # ... with 1,104 more rows
bt_bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(HotelName) %>%
top_n(6) %>%
ungroup() %>%
ggplot(aes(bigram, tf_idf, fill = HotelName)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~HotelName, ncol = 4, scales = "free") +
coord_flip()
## Selecting by tf_idf
