一.確立動機和分析目的

組員陳鈴俊在澎湖地區經營旅館業,澎湖旅遊市場尚處於成長期,每年蒞澎旅客人數逐年增加,從早期的50,60萬到去年的120萬人次(澎縣府統計),推測是交通運輸能量之成長造成(布袋到龍門航線).但因旅遊同業亦成長快速,縣府統計,合法旅館業者約50家,房間數3300間,合法民宿業者約700家,房間數約3200間,而非法營業者則無法估計.澎湖在地經營業者,在此紅海市場中,唯有知己知彼,加強本身經營專業能力,否則無法生存.為達知己知彼提升自我專業能力之目的,希望藉由網路訂房網站之顧客評論內容(Booking,Agoda,Airbnb為世界三大訂房網站),來分析澎湖地區各旅館民宿之優缺點,評價分數,常見之客訴問題排名,好評排名等等,以做為經營策略之決策依據.

二.尋找適當的資料集和資料集的描述.

1.Booking和Agoda兩網站較適合旅館業,而Airbnb只有民宿,較不適合,本研究不採用.

2.利用網路爬蟲,從Booking和Agoda網站,找到澎湖地區營業量較大之旅館和民宿的顧客評論做為分析的資料來源.

3.Agoda約100家,Booking約320家業者,皆為.json檔,經初步轉換成data.frame形式,存成.csv檔

4.Agoda_Hotel有Hotel_ID,Link,Name三欄位,共100家業者資料.

5.Booking_Hotel有Link,Name二欄位,共323家業者資料.

6.Agoda_reviews有HotelName,Country,Rate,Review,ReviewDate,ReviewNeg,ReviewPos七欄位,共2493筆資料錄.

7.Booking_reviews有HotelName,Country,Rate,Review,ReviewDate,ReviewNeg,ReviewPos七欄位,共12255筆資料錄.

三.分析資料

四.解釋分析的結果(視覺化和結論)

載入套件

setwd("C:/learning/mid")
require(ggplot2)

## Loading required package: ggplot2

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(data.table)

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require(scales)

## Loading required package: scales

library(tidytext)
library(jiebaR)

## Loading required package: jiebaRD

library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)

## Loading required package: RColorBrewer

library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

載入檔案

ah <- fread("agoda_hotels.csv")
agoda<- fread("agoda_reviews.csv")
bh <- fread("booking_hotels.csv")
booking<- fread("booking_reviews.csv")

初步整理資料檔,ah和bh皆只剩下一個Name欄位

ah<-ah[,-c(1,2)]
bh<-bh[,-1]

分析agoda飯店民宿比率(電腦做的不準確,若要百分百準確需以人工來處理)

agoda民宿65家,飯店26家,booking民宿253家,飯店58家

a_homestay_count<-ah$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
a_hotel_count<-ah$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()
b_homestay_count<-bh$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
b_hotel_count<-bh$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()

合併agoda和booking,並經由人工檢查是否有想同飯店使用不同名稱

結果全部業者共366家,民宿284家,飯店64家

th<-rbind(ah,bh)
th<-th %>% distinct() %>% arrange(desc(Name))
t_homestay_count<-th$Name %>% str_detect(regex("宿|村|子|屋|墅|巷|舍|園|棧|house|home")) %>% sum()
t_hotel_count<-th$Name %>% str_detect(regex("店|館|中心|文旅")) %>% sum()

將booking及agoda合併

booking <- select(booking, HotelName, Country, Rate, Review, ReviewNeg)%>%
  mutate(web = "bk")
agoda<- select(agoda, HotelName, Country, Rate, Review, ReviewNeg)%>% 
  mutate(web = "ag")
chr<- rbind(booking,agoda)

計算agoda/booking/合併，各家業者之評分平均值，並排序

可以看出民宿的評比分數皆比較好，優於飯店的評比

a_rank<-aggregate(Rate~HotelName,agoda,mean) %>% arrange(-Rate)
b_rank<-aggregate(Rate~HotelName,booking,mean) %>% arrange(-Rate)
T_rank<-aggregate(Rate~HotelName,chr,mean) %>% arrange(-Rate)

自訂user word及停用字並用結巴斷詞

jieba_tokenizer <- worker(stop_word ="stop_words.txt",user="user_words.txt")

book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

對booking的總評語做token化

tidybook = booking %>% unnest_tokens(word,Review,token= book_tokenizer)  %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>% select(HotelName,Country,Rate,word,wordn)


head(tidybook)

##        HotelName Country Rate word wordn
## 1     雅霖大飯店    臺灣   10 服務  這次
## 1.3   雅霖大飯店    臺灣   10 服務  住宿
## 1.4   雅霖大飯店    臺灣   10 服務  陽台
## 1.1   雅霖大飯店    臺灣   10 人員  這次
## 1.1.1 雅霖大飯店    臺灣   10 人員  住宿
## 1.1.2 雅霖大飯店    臺灣   10 人員  陽台

tidybook1 = booking %>% unnest_tokens(word,Review,token= book_tokenizer)  %>% unnest_tokens(wordn,ReviewNeg,token= book_tokenizer) %>% select(HotelName,Country,Rate,word,wordn)


head(tidybook1)

##        HotelName Country Rate word wordn
## 1     雅霖大飯店    臺灣   10 服務  這次
## 1.3   雅霖大飯店    臺灣   10 服務  住宿
## 1.4   雅霖大飯店    臺灣   10 服務  陽台
## 1.1   雅霖大飯店    臺灣   10 人員  這次
## 1.1.1 雅霖大飯店    臺灣   10 人員  住宿
## 1.1.2 雅霖大飯店    臺灣   10 人員  陽台

製作總評語長條圖

tidybook %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  ylab("出現次數") +
  coord_flip()

製作總評語文字雲

tokens_count <- tidybook %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>50) %>%
  arrange(desc(sum))
head(tokens_count)

## # A tibble: 6 x 2
##   word     sum
##   <chr>  <int>
## 1 傑出    2433
## 2 令人    1893
## 3 很棒    1724
## 4 民宿    1262
## 5 愉悅    1149
## 6 好極了  1113

tokens_count %>% wordcloud2()

製作負評語長條圖

tidybook1 %>%
  count(wordn, sort = TRUE) %>%
  filter(n > 300) %>%
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(wordn, n)) +
  geom_col() +
  xlab(NULL) +
  ylab("出現次數") +
  coord_flip()

增加情緒字典

liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")
liwc_p1 <- read_file("C:/learning/hw2/dict/liwc/positive1.txt")
liwc_n1 <- read_file("C:/learning/hw2/dict/liwc/negative1.txt")
#切出LIWC情緒的正負評價，製作對照表
positive <- strsplit(liwc_p, "[,]")[[1]]
negative <- strsplit(liwc_n, "[,]")[[1]]
positive1 <- strsplit(liwc_p1, "[,]")[[1]]
negative1 <- strsplit(liwc_n1, "[,]")[[1]]
positive <- data.frame(word = positive, sentiment = "positive",stringsAsFactors = F)
negative <- data.frame(word = negative, sentiment = "negative",stringsAsFactors = F)
positive1 <- data.frame(wordn = positive1, sentiment = "positive",stringsAsFactors = F)
negative1 <- data.frame(wordn = negative1, sentiment = "negative",stringsAsFactors = F)

LIWC_ch = rbind(positive,negative)
head(LIWC_ch)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

LIWC_ch1 = rbind(positive1,negative1)
head(LIWC_ch1)

##      wordn sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

總評語正負情緒對照文字雲

tokens_count %>% 
  inner_join(LIWC_ch) %>%
  select(word,sentiment,sum) %>%
  acast(word ~ sentiment,value.var = "sum", fill = 0) %>% 
  wordcloud::comparison.cloud(random.order=FALSE,colors = c("gray80", "indianred3"),max.words = 108)

## Joining, by = "word"

負評語正負情緒對照文字雲

tokens_count1 <- tidybook1 %>% 
  filter(nchar(.$wordn)>1) %>%
  group_by(wordn) %>% 
  summarise(sum = n()) %>% 
  filter(sum>50) %>%
  arrange(desc(sum))
head(tokens_count1)

## # A tibble: 6 x 2
##   wordn   sum
##   <chr> <int>
## 1 the    1611
## 2 房間   1534
## 3 早餐   1002
## 4 有點    938
## 5 to      696
## 6 提供    677

tokens_count1 %>% 
  inner_join(LIWC_ch1) %>%
  select(wordn,sentiment,sum) %>%
  acast(wordn ~ sentiment,value.var = "sum", fill = 0) %>% 
  wordcloud::comparison.cloud(random.order=FALSE,colors = c("gray80", "indianred3"),max.words = 108)

## Joining, by = "wordn"

總評語各飯店情緒差值

bt_sentiment <-tidybook %>%
  inner_join(LIWC_ch) %>%
  count(HotelName = HotelName, sentiment)%>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentimentx = positive - negative)

## Joining, by = "word"

head(bt_sentiment)

## # A tibble: 6 x 4
##   HotelName           negative positive sentimentx
##   <chr>                  <dbl>    <dbl>      <dbl>
## 1 ?樂民宿                   18      215        197
## 2 180度沙灘最前線民宿        0        5          5
## 3 198 紅帽民宿               0       37         37
## 4 2巷9號海景民宿             0        4          4
## 5 35民宿                     0        9          9
## 6 525民宿                    0      134        134

總評語各飯店情緒差值圖形化

ggplot(bt_sentiment, aes(HotelName, sentimentx,fill = HotelName)) +
  geom_col(show.legend = FALSE,width = 0.8) +
  xlab("飯店")+
  ylab("情緒差值")

負評語各飯店情緒差值圖形化

bn_sentiment <-tidybook1 %>%
  inner_join(LIWC_ch1) %>%
  count(HotelName = HotelName, sentiment)%>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentimentx = positive - negative)

## Joining, by = "wordn"

head(bn_sentiment)

## # A tibble: 6 x 4
##   HotelName       negative positive sentimentx
##   <chr>              <dbl>    <dbl>      <dbl>
## 1 ?樂民宿              136       37        -99
## 2 198 紅帽民宿           2        3          1
## 3 2巷9號海景民宿         3        3          0
## 4 35民宿                 7        7          0
## 5 525民宿                0       25         25
## 6 7號旅店背包客棧       11        3         -8

負評語各飯店情緒差值圖形化

ggplot(bn_sentiment, aes(HotelName, sentimentx,fill = HotelName)) +
  geom_col(show.legend = FALSE,width = 0.8) +
  xlab("飯店")+
  ylab("情緒差值")

總評語各情緒詞出現次數並遞減排序

word_count <- tidybook %>%
  inner_join(LIWC_ch) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

word_count

## # A tibble: 145 x 3
##    word  sentiment     n
##    <chr> <chr>     <int>
##  1 傑出  positive   2433
##  2 很棒  positive   1724
##  3 愉悅  positive   1149
##  4 不錯  positive    975
##  5 失望  negative    810
##  6 舒適  positive    785
##  7 親切  positive    556
##  8 乾淨  positive    542
##  9 問題  negative    479
## 10 值得  positive    390
## # ... with 135 more rows

分別呈現總評語正負情緒詞出現次數前10名

word_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

## Selecting by n

負評語各情緒詞出現次數並遞減排序

word_count1 <- tidybook1 %>%
  inner_join(LIWC_ch1) %>%
  count(wordn, sentiment, sort = TRUE) %>%
  ungroup()

## Joining, by = "wordn"

word_count1

## # A tibble: 219 x 3
##    wordn sentiment     n
##    <chr> <chr>     <int>
##  1 不好  negative    302
##  2 希望  positive    271
##  3 清楚  positive    191
##  4 問題  negative    173
##  5 乾淨  positive    172
##  6 改善  positive    148
##  7 容易  positive    139
##  8 不便  negative    137
##  9 喜歡  positive    132
## 10 不足  negative    122
## # ... with 209 more rows

負評語各正負情緒詞出現次數前10名

word_count1 %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(wordn, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

## Selecting by n

總評語-增加各斷詞出現的總數

com_word <- tidybook %>%
    count(HotelName, word, sort = TRUE)

total_words <- com_word %>% 
  group_by(HotelName) %>%
  summarize(total = sum(n))

com_word <- left_join(com_word, total_words)

## Joining, by = "HotelName"

com_word

## # A tibble: 5,890 x 4
##    HotelName          word      n total
##    <chr>              <chr> <int> <int>
##  1 平湖窩行旅         規劃    250  7944
##  2 澎湖安一海景大飯店 hotel   224  6891
##  3 平湖窩行旅         不錯    212  7944
##  4 平湖窩行旅         質感    196  7944
##  5 百勝民宿           was     195  1756
##  6 豐谷大飯店         ok      188  1663
##  7 豐谷大飯店         卡拉    188  1663
##  8 日立大飯店         令人    187  6696
##  9 平湖窩行旅         整體    183  7944
## 10 寶華大飯店         hotel   182  4451
## # ... with 5,880 more rows

總評語-增加rank及詞性頻率欄位

freq_by_rank <- com_word %>%  
  group_by(HotelName) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank

## # A tibble: 5,890 x 6
## # Groups:   HotelName [199]
##    HotelName          word      n total  rank `term frequency`
##    <chr>              <chr> <int> <int> <int>            <dbl>
##  1 平湖窩行旅         規劃    250  7944     1           0.0315
##  2 澎湖安一海景大飯店 hotel   224  6891     1           0.0325
##  3 平湖窩行旅         不錯    212  7944     2           0.0267
##  4 平湖窩行旅         質感    196  7944     3           0.0247
##  5 百勝民宿           was     195  1756     1           0.111 
##  6 豐谷大飯店         ok      188  1663     1           0.113 
##  7 豐谷大飯店         卡拉    188  1663     2           0.113 
##  8 日立大飯店         令人    187  6696     1           0.0279
##  9 平湖窩行旅         整體    183  7944     4           0.0230
## 10 寶華大飯店         hotel   182  4451     1           0.0409
## # ... with 5,880 more rows

畫出rank及詞性頻率的關係線圖

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color =HotelName)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

examine Zipf’s law

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)

## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##    -0.07228     -1.32710

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = HotelName)) + 
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

總評語bind_tf_idf

雅霖大飯店/澎澄飯店/豐谷大飯店/平湖窩行旅

bt_words1 <- com_word %>%
  bind_tf_idf(word, HotelName, n)

bt_words3<- (bt_words1[grepl("雅霖大飯店",bt_words1$HotelName),])
bt_words2<- (bt_words1[grepl("澎澄飯店" ,bt_words1$HotelName),])
bt_words4<- (bt_words1[grepl("豐谷大飯店" ,bt_words1$HotelName),])
bt_words5<- (bt_words1[grepl("平湖窩行旅" ,bt_words1$HotelName),])
bt_words<-rbind(bt_words2,bt_words3,bt_words5,bt_words4)

bt_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 421 x 6
##    HotelName  word         n     tf   idf tf_idf
##    <chr>      <chr>    <int>  <dbl> <dbl>  <dbl>
##  1 豐谷大飯店 卡拉       188 0.113   5.29  0.598
##  2 豐谷大飯店 ok         188 0.113   3.50  0.396
##  3 豐谷大飯店 對面        94 0.0565  5.29  0.299
##  4 豐谷大飯店 整晚        94 0.0565  4.60  0.260
##  5 豐谷大飯店 太吵        94 0.0565  4.19  0.237
##  6 雅霖大飯店 low         71 0.0437  5.29  0.231
##  7 豐谷大飯店 噪音        94 0.0565  3.68  0.208
##  8 雅霖大飯店 services    71 0.0437  4.60  0.201
##  9 豐谷大飯店 解決        94 0.0565  3.50  0.198
## 10 豐谷大飯店 實惠        94 0.0565  3.50  0.198
## # ... with 411 more rows

總評語

這四家飯店前15個常出現的字詞畫視覺圖

bt_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(HotelName) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~HotelName, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

找出四家飯店的bigram

bt_bigrams <- booking %>%
  unnest_tokens(bigram, Review, token = "ngrams", n = 2)
bt_bigrams1 <- (bt_bigrams[grepl("雅霖大飯店",bt_bigrams$HotelName),])
bt_bigrams2 <- (bt_bigrams[grepl("澎澄飯店",bt_bigrams$HotelName),])
bt_bigrams3 <- (bt_bigrams[grepl("豐谷大飯店",bt_bigrams$HotelName),])
bt_bigrams4 <- (bt_bigrams[grepl("平湖窩行旅",bt_bigrams$HotelName),])

bt_bigrams<-rbind(bt_bigrams1,bt_bigrams2, bt_bigrams3, bt_bigrams4)
 
bt_bigrams %>% na.omit(bt_bigrams)  %>% 
  count(bigram, sort = TRUE)

## # A tibble: 1,141 x 2
##    bigram            n
##    <chr>         <int>
##  1 傑出 傑出       208
##  2 好 好            71
##  3 非常 好          56
##  4 很 棒            56
##  5 好極了 好極了    42
##  6 令人 愉悅        29
##  7 好 非常          27
##  8 棒 很            25
##  9 入 住            10
## 10 愉悅 令人        10
## # ... with 1,131 more rows

依四家飯店資料整理bigram

bt_bigrams_separated <- bt_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bt_bigrams_filtered <- bt_bigrams_separated %>% na.omit(bt_bigram_counts) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bt_bigram_counts <- bt_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE) %>% na.omit(bt_bigram_counts)

bt_bigram_united <- bt_bigrams_filtered %>% 
  unite(bigram, word1, word2, sep = " ") %>% na.omit(bt_bigram_united) %>% 
  select(HotelName,bigram)

bt_bigram_counts

## # A tibble: 1,048 x 3
##    word1  word2      n
##    <chr>  <chr>  <int>
##  1 傑出   傑出     208
##  2 好     好        71
##  3 非常   好        56
##  4 很     棒        56
##  5 好極了 好極了    42
##  6 令人   愉悅      29
##  7 好     非常      27
##  8 棒     很        25
##  9 入     住        10
## 10 愉悅   令人      10
## # ... with 1,038 more rows

常出現的前6種birgam組合

bt_bigram_tf_idf <- bt_bigram_united %>%
  count(HotelName, bigram) %>%
  bind_tf_idf(bigram, HotelName, n) %>%
  arrange(desc(tf_idf))

bt_bigram_tf_idf

## # A tibble: 1,114 x 6
##    HotelName  bigram            n      tf   idf  tf_idf
##    <chr>      <chr>         <int>   <dbl> <dbl>   <dbl>
##  1 豐谷大飯店 到 餐廳           2 0.0116  1.39  0.0161 
##  2 豐谷大飯店 空 的             2 0.0116  1.39  0.0161 
##  3 雅霖大飯店 去 澎湖           3 0.0102  1.39  0.0142 
##  4 雅霖大飯店 雅 霖             3 0.0102  1.39  0.0142 
##  5 雅霖大飯店 的 房間           2 0.00683 1.39  0.00946
##  6 雅霖大飯店 的 飯店           2 0.00683 1.39  0.00946
##  7 雅霖大飯店 霖 飯店           2 0.00683 1.39  0.00946
##  8 平湖窩行旅 好極了 好極了    33 0.0319  0.288 0.00918
##  9 豐谷大飯店 15 到             1 0.00581 1.39  0.00806
## 10 豐谷大飯店 2700 的           1 0.00581 1.39  0.00806
## # ... with 1,104 more rows

bt_bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(HotelName) %>% 
  top_n(6) %>% 
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~HotelName, ncol = 4, scales = "free") +
  coord_flip()

## Selecting by tf_idf

期中專案_澎湖飯店評價分析

組員:鄭光廷01,張毓軒13,陳鈴俊14,黃谷鴻15

2019年4月23日

一.確立動機和分析目的

二.尋找適當的資料集和資料集的描述.

1.Booking和Agoda兩網站較適合旅館業,而Airbnb只有民宿,較不適合,本研究不採用.

2.利用網路爬蟲,從Booking和Agoda網站,找到澎湖地區營業量較大之旅館和民宿的顧客評論做為分析的資料來源.

3.Agoda約100家,Booking約320家業者,皆為.json檔,經初步轉換成data.frame形式,存成.csv檔

4.Agoda_Hotel有Hotel_ID,Link,Name三欄位,共100家業者資料.

5.Booking_Hotel有Link,Name二欄位,共323家業者資料.

6.Agoda_reviews有HotelName,Country,Rate,Review,ReviewDate,ReviewNeg,ReviewPos七欄位,共2493筆資料錄.

7.Booking_reviews有HotelName,Country,Rate,Review,ReviewDate,ReviewNeg,ReviewPos七欄位,共12255筆資料錄.

三.分析資料

四.解釋分析的結果(視覺化和結論)

載入套件

載入檔案

初步整理資料檔,ah和bh皆只剩下一個Name欄位

分析agoda飯店民宿比率(電腦做的不準確,若要百分百準確需以人工來處理)

agoda民宿65家,飯店26家,booking民宿253家,飯店58家

合併agoda和booking,並經由人工檢查是否有想同飯店使用不同名稱

結果全部業者共366家,民宿284家,飯店64家

將booking及agoda合併

計算agoda/booking/合併，各家業者之評分平均值，並排序

可以看出民宿的評比分數皆比較好，優於飯店的評比

自訂user word及停用字並用結巴斷詞

對booking的總評語做token化

製作總評語長條圖

製作總評語文字雲

製作負評語長條圖

增加情緒字典

總評語正負情緒對照文字雲

負評語正負情緒對照文字雲

總評語各飯店情緒差值

總評語各飯店情緒差值圖形化

負評語各飯店情緒差值圖形化

負評語各飯店情緒差值圖形化

總評語各情緒詞出現次數並遞減排序

分別呈現總評語正負情緒詞出現次數前10名

負評語各情緒詞出現次數並遞減排序

負評語各正負情緒詞出現次數前10名

總評語-增加各斷詞出現的總數

總評語-增加rank及詞性頻率欄位

畫出rank及詞性頻率的關係線圖

examine Zipf’s law

總評語bind_tf_idf

雅霖大飯店/澎澄飯店/豐谷大飯店/平湖窩行旅

總評語

這四家飯店前15個常出現的字詞畫視覺圖

找出四家飯店的bigram

依四家飯店資料整理bigram

常出現的前6種birgam組合