# Switch the working directory for the rest of the script. The tokenizer setup
# further down passes relative file names ("stop_words.txt", "user_words.txt"),
# which presumably resolve against this directory.
setwd("C:/learning/mid")
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
# Load the Booking.com review export and convert ReviewDate to Date values.
booking <- fread("C:/learning/mid/booking_reviews.csv")
# Fix: the previous format string "%Y年%sep,m月%sep,d日" was malformed, so the
# dates parsed to NA (visible in the str() listing recorded below, which
# predates this fix). Use the same year/month/day pattern as the agoda loader.
# NOTE(review): assumes the raw values look like "2019年4月5日" - confirm
# against the CSV.
booking$ReviewDate <- as.Date(booking$ReviewDate, format = "%Y年%m月%d日")
str(booking)
## Classes 'data.table' and 'data.frame':   12255 obs. of  11 variables:
##  $ V1        : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ HotelName : chr  "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
##  $ Country   : chr  "臺灣" "臺灣" "臺灣" "臺灣" ...
##  $ Rate      : num  10 8.3 10 10 9.2 7.5 10 8.3 8.8 8.3 ...
##  $ Review    : chr  "服務人員態度很好。" "非常好" "傑出" "舒服" ...
##  $ ReviewDate: Date, format: NA NA ...
##  $ ReviewNeg : chr  "這次住宿沒有陽台" "" "" "每天早餐套餐都一樣。" ...
##  $ ReviewPos : chr  "" "在清明連假入住櫃檯人員鄭小姐服務親切友善即使遊客多也沒有不耐煩" "很乾淨,飯店人員很親切" "" ...
##  $ V9        : logi  NA NA NA NA NA NA ...
##  $ V10       : logi  NA NA NA NA NA NA ...
##  $ V11       : logi  NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Read the agoda review export, turn the ReviewDate text (year/month/day with
# the Chinese date markers) into Date values, and print the table structure.
agoda <- fread("C:/learning/mid/agoda_reviews.csv")
agoda$ReviewDate <- as.Date(agoda$ReviewDate, "%Y年%m月%d日")
str(agoda)
## Classes 'data.table' and 'data.frame':   2493 obs. of  8 variables:
##  $ V1        : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ HotelName : chr  "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" ...
##  $ Country   : chr  "tw" "tw" "tw" "tw" ...
##  $ Rate      : num  10 9.2 10 10 9.2 10 10 9.2 8.8 9.2 ...
##  $ Review    : chr  "很有設計的飯店,不論是美感或是體貼房客使用的設計,另外正對港景的view令人讚賞,不論是白天的波光點點或是晚上的舒"| __truncated__ "" "" "位置佳,飯店有機場接駁,從飯店走到鬧區十來分鐘,散步很棒房間乾淨,小女有異位體質,不乾淨的地方會馬上過敏,住宿"| __truncated__ ...
##  $ ReviewDate: Date, format: "2019-02-22" "2018-12-08" ...
##  $ ReviewNeg : chr  "" "早餐不夠豐盛" "" "" ...
##  $ ReviewPos : chr  "" "房間乾淨,view不錯" "南海遊客中心對面交通便利,緊鄰PIER 3百貨非常棒" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Reduce each source to the columns used by the text analysis and tag every
# row with a `web` marker ("bk" for the booking table, "ag" for the agoda one).
booking <- booking %>%
  select(HotelName, Country, Rate, Review, ReviewNeg) %>%
  mutate(web = "bk")
# Fix: this step used to select from `booking` again, which threw the agoda
# reviews away and stacked the booking reviews twice in `com`. Outputs recorded
# further down in this file were produced before this fix.
agoda <- agoda %>%
  select(HotelName, Country, Rate, Review, ReviewNeg) %>%
  mutate(web = "ag")
# NOTE(review): Country is coded differently in the two sources ("臺灣" vs "tw"
# in the str() listings above); harmonise it if Country is ever analysed.
# Stack both sources into a single table.
com <- rbind(booking, agoda)
# jiebaR segmentation worker configured with a stop-word list and a user
# dictionary; both files are given as relative paths.
jieba_tokenizer <- worker(stop_word = "stop_words.txt", user = "user_words.txt")

# Tokenizer for unnest_tokens(): segments each element of `t` with
# `jieba_tokenizer` and keeps only the tokens longer than one character.
# Returns a list with one character vector per element of `t`.
book_tokenizer <- function(t) {
  keep_multichar <- function(x) {
    pieces <- segment(x, jieba_tokenizer)
    pieces[nchar(pieces) > 1]
  }
  lapply(t, keep_multichar)
}
# Read the raw LIWC positive / negative word lists (one string per file).
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")

# Split each list on commas and build a word -> sentiment lookup table.
positive <- data.frame(
  word = unlist(strsplit(liwc_p, "[,]")),
  sentiment = "positive",
  stringsAsFactors = FALSE
)
negative <- data.frame(
  word = unlist(strsplit(liwc_n, "[,]")),
  sentiment = "negative",
  stringsAsFactors = FALSE
)

# Combined dictionary: positive entries first, then negative ones.
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive
# Tokenise the Review column into `word` and the ReviewNeg column into `wordn`,
# then keep the hotel attributes together with both token columns.
# NOTE(review): because the two unnest_tokens() calls are chained on the same
# table, every `word` of a review is paired with every `wordn` of that review
# (see the head() listing below), so counts of either column are multiplied by
# the number of tokens in the other one. Consider tokenising the two columns
# into separate tables.
tidybook1 <- com %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  select(HotelName, Country, Rate, word, wordn)


head(tidybook1)
##        HotelName Country Rate word wordn
## 1     雅霖大飯店    臺灣   10 服務  這次
## 1.3   雅霖大飯店    臺灣   10 服務  住宿
## 1.4   雅霖大飯店    臺灣   10 服務  陽台
## 1.1   雅霖大飯店    臺灣   10 人員  這次
## 1.1.1 雅霖大飯店    臺灣   10 人員  住宿
## 1.1.2 雅霖大飯店    臺灣   10 人員  陽台
# Frequency table of Review tokens (`word`): tokens longer than one character
# that occur more than 50 times, sorted from most to least frequent.
tokens_count1 <- tidybook1 %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 50) %>%
  arrange(desc(sum))
head(tokens_count1)
## # A tibble: 6 x 2
##   word     sum
##   <chr>  <int>
## 1 傑出    4866
## 2 令人    3786
## 3 很棒    3448
## 4 民宿    2524
## 5 愉悅    2298
## 6 好極了  2226
# Word cloud sized by the frequencies above.
wordcloud2(tokens_count1)
# Frequency table of ReviewNeg tokens (`wordn`): tokens longer than one
# character that occur more than 50 times, sorted from most to least frequent.
tokens_count2 <- tidybook1 %>%
  filter(nchar(wordn) > 1) %>%
  group_by(wordn) %>%
  summarise(sum = n()) %>%
  filter(sum > 50) %>%
  arrange(desc(sum))
# Fix: preview tokens_count2. This used to print tokens_count1 a second time,
# so the listing previously pasted here merely repeated the `word` table;
# re-run to record the `wordn` preview.
head(tokens_count2)
# Word cloud sized by the ReviewNeg token frequencies.
tokens_count2 %>% wordcloud2()
# Horizontal bar chart of the Review tokens that occur more than 1000 times,
# with bars ordered by count.
tidybook1 %>%
  count(word, sort = TRUE) %>%
  filter(n > 1000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()

# Horizontal bar chart of the ReviewNeg tokens that occur more than 600 times.
tidybook1 %>%
  count(wordn, sort = TRUE) %>%
  filter(n > 600) %>%
  # Fix: the reordered factor used to be stored in a new `word` column while
  # the plot mapped `wordn`, so the bars were never ordered by count.
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(wordn, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

# Comparison cloud: attach the LIWC sentiment to the frequent Review tokens,
# pivot to a word x sentiment matrix of counts (0 where a word lacks that
# sentiment) and draw the two sentiments against each other.
tokens_count1 %>%
  inner_join(LIWC_ch) %>%
  select(word, sentiment, sum) %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  wordcloud::comparison.cloud(
    random.order = FALSE,
    colors = c("gray80", "indianred3"),
    max.words = 108
  )
## Joining, by = "word"

# Per-hotel sentiment tally: count the LIWC-matched Review tokens by hotel and
# sentiment, spread the two sentiments into columns (missing -> 0) and take
# positive minus negative as `sentimentx`.
calsentiment <- tidybook1 %>%
  inner_join(LIWC_ch) %>%
  count(HotelName, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentimentx = positive - negative)
## Joining, by = "word"
head(calsentiment)
## # A tibble: 6 x 4
##   HotelName           negative positive sentimentx
##   <chr>                  <dbl>    <dbl>      <dbl>
## 1 ?樂民宿                   36      430        394
## 2 180度沙灘最前線民宿        0       10         10
## 3 198 紅帽民宿               0       74         74
## 4 2巷9號海景民宿             0        8          8
## 5 35民宿                     0       18         18
## 6 525民宿                    0      268        268
# One bar per hotel showing the positive-minus-negative difference; bars are
# coloured by hotel and the legend is suppressed.
ggplot(calsentiment, aes(x = HotelName, y = sentimentx, fill = HotelName)) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "飯店", y = "情緒差值")

# Flip the sign of the negative counts and reshape to long form: one row per
# hotel and sentiment, with the signed count stored in `cnt`.
calsentiment <- calsentiment %>%
  mutate(negative = negative * -1) %>%
  gather(key = sentiment, value = cnt, negative:positive)
head(calsentiment)
## # A tibble: 6 x 4
##   HotelName           sentimentx sentiment   cnt
##   <chr>                    <dbl> <chr>     <dbl>
## 1 ?樂民宿                    394 negative    -36
## 2 180度沙灘最前線民宿         10 negative      0
## 3 198 紅帽民宿                74 negative      0
## 4 2巷9號海景民宿               8 negative      0
## 5 35民宿                      18 negative      0
## 6 525民宿                    268 negative      0
# Signed sentiment counts per hotel (negative counts point downwards), filled
# by sentiment, with each bar labelled by the hotel's `sentimentx` value.
ggplot(calsentiment, aes(x = HotelName, y = cnt, fill = sentiment)) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "HotelName", y = "情緒值") +
  geom_text(aes(label = sentimentx))

# How often each LIWC-matched Review token occurs, together with its
# sentiment, most frequent first.
word_count <- ungroup(
  tidybook1 %>%
    inner_join(LIWC_ch) %>%
    count(word, sentiment, sort = TRUE)
)
## Joining, by = "word"
word_count
## # A tibble: 145 x 3
##    word  sentiment     n
##    <chr> <chr>     <int>
##  1 傑出  positive   4866
##  2 很棒  positive   3448
##  3 愉悅  positive   2298
##  4 不錯  positive   1950
##  5 失望  negative   1620
##  6 舒適  positive   1570
##  7 親切  positive   1112
##  8 乾淨  positive   1084
##  9 問題  negative    958
## 10 值得  positive    780
## # ... with 135 more rows
# Top 10 Review tokens per sentiment (top_n() picks the last column, `n`),
# drawn as horizontal bars in one facet per sentiment.
word_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  xlab(NULL) +
  ylab("Contribution to sentiment") +
  coord_flip()
## Selecting by n

# Count the ReviewNeg tokens (`wordn`) per LIWC sentiment, most frequent first.
# Fix: the join used to match implicitly on `word` (the Review token), so every
# `wordn` inherited the sentiment of an unrelated token from the same review -
# e.g. "房間" showing up as positive in the listing recorded below, which
# predates this fix. Match the dictionary against `wordn` instead.
word_count1 <- tidybook1 %>%
  inner_join(LIWC_ch, by = c("wordn" = "word")) %>%
  count(wordn, sentiment, sort = TRUE) %>%
  ungroup()
word_count1
## # A tibble: 4,696 x 3
##    wordn sentiment     n
##    <chr> <chr>     <int>
##  1 房間  positive    476
##  2 早餐  positive    398
##  3 有點  positive    388
##  4 浴室  positive    280
##  5 隔音  positive    190
##  6 提供  positive    156
##  7 建議  positive    152
##  8 民宿  positive    140
##  9 比較  positive    132
## 10 希望  positive    130
## # ... with 4,686 more rows
# Top 10 ReviewNeg tokens per sentiment (top_n() picks the last column, `n`),
# drawn as horizontal bars in one facet per sentiment.
word_count1 %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(x = wordn, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  xlab(NULL) +
  ylab("Contribution to sentiment") +
  coord_flip()
## Selecting by n

# Token counts per hotel, largest first.
com_word <- count(tidybook1, HotelName, word, sort = TRUE)

# Total number of counted tokens for each hotel.
total_words <- com_word %>%
  group_by(HotelName) %>%
  summarise(total = sum(n))

# Attach each hotel's total to its per-token rows.
com_word <- com_word %>% left_join(total_words)
## Joining, by = "HotelName"
com_word
## # A tibble: 5,890 x 4
##    HotelName          word      n total
##    <chr>              <chr> <int> <int>
##  1 平湖窩行旅         規劃    500 15888
##  2 澎湖安一海景大飯店 hotel   448 13782
##  3 平湖窩行旅         不錯    424 15888
##  4 平湖窩行旅         質感    392 15888
##  5 百勝民宿           was     390  3512
##  6 豐谷大飯店         ok      376  3326
##  7 豐谷大飯店         卡拉    376  3326
##  8 日立大飯店         令人    374 13392
##  9 平湖窩行旅         整體    366 15888
## 10 寶華大飯店         hotel   364  8902
## # ... with 5,880 more rows
# Within each hotel, number the rows in their current order (com_word was
# counted with sort = TRUE, so rank 1 is the most frequent token) and compute
# each token's share of the hotel's total. The result stays grouped by hotel.
freq_by_rank <- com_word %>%
  group_by(HotelName) %>%
  mutate(rank = row_number()) %>%
  mutate(`term frequency` = n / total)

freq_by_rank
## # A tibble: 5,890 x 6
## # Groups:   HotelName [199]
##    HotelName          word      n total  rank `term frequency`
##    <chr>              <chr> <int> <int> <int>            <dbl>
##  1 平湖窩行旅         規劃    500 15888     1           0.0315
##  2 澎湖安一海景大飯店 hotel   448 13782     1           0.0325
##  3 平湖窩行旅         不錯    424 15888     2           0.0267
##  4 平湖窩行旅         質感    392 15888     3           0.0247
##  5 百勝民宿           was     390  3512     1           0.111 
##  6 豐谷大飯店         ok      376  3326     1           0.113 
##  7 豐谷大飯店         卡拉    376  3326     2           0.113 
##  8 日立大飯店         令人    374 13392     1           0.0279
##  9 平湖窩行旅         整體    366 15888     4           0.0230
## 10 寶華大飯店         hotel   364  8902     1           0.0409
## # ... with 5,880 more rows
# Rank versus term frequency on log-log axes, one line per hotel.
ggplot(freq_by_rank, aes(x = rank, y = `term frequency`, color = HotelName)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

# Keep the ranks strictly between 10 and 500, then fit log10(term frequency)
# against log10(rank) on that subset and print the fitted model.
rank_subset <- freq_by_rank %>%
  filter(rank > 10 & rank < 500)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##    -0.07228     -1.32710
# Rank versus term frequency on log-log axes with the fitted power-law line.
# Fix: the dashed reference line used a hard-coded intercept (-0.62) and slope
# (-1.1) that did not match the regression recorded above for this data
# (-0.07228 / -1.32710). Refit on rank_subset and draw the fitted coefficients,
# so the line follows the data whenever the inputs change.
zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_coef <- unname(coef(zipf_fit))
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = HotelName)) +
  geom_abline(
    intercept = zipf_coef[1],
    slope = zipf_coef[2],
    color = "gray50",
    linetype = 2
  ) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

# tf-idf of every token, treating each hotel as one document.
book_words1 <- bind_tf_idf(com_word, word, HotelName, n)

# Rows of book_words1 whose HotelName matches `pattern`.
pick_hotel <- function(pattern) {
  book_words1[grepl(pattern, book_words1$HotelName), ]
}

# Pull out the four hotels to compare and stack them in the same order as
# before (book_words2, 3, 5, 4).
book_words3 <- pick_hotel("雅霖大飯店")
book_words2 <- pick_hotel("澎澄飯店")
book_words4 <- pick_hotel("豐谷大飯店")
book_words5 <- pick_hotel("彩虹文旅")
book_words <- rbind(book_words2, book_words3, book_words5, book_words4)

# Top 10 tokens per hotel (top_n() picks the last column, `tf_idf`), shown as
# horizontal bars in a two-column grid of facets.
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(HotelName) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  xlab(NULL) +
  ylab("tf-idf") +
  facet_wrap(~HotelName, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

# Mean tf-idf of every token across the hotels it appears in.
term_avg_tfidf <- book_words1 %>%
  group_by(word) %>%
  summarise(tfidf_avg = mean(tf_idf))
# Show the tokens with the highest average tf-idf first.
arrange(term_avg_tfidf, desc(tfidf_avg))
## # A tibble: 2,185 x 2
##    word       tfidf_avg
##    <chr>          <dbl>
##  1 中選            5.29
##  2 共和            2.65
##  3 刻薄            2.65
##  4 最后            2.34
##  5 formidable      1.92
##  6 jour            1.92
##  7 優缺點          1.81
##  8 之外            1.76
##  9 擁有            1.76
## 10 輕鬆自在        1.70
## # ... with 2,175 more rows
# Distribution of the average tf-idf values.
term_avg_tfidf$tfidf_avg %>% summary
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000666 0.017026 0.054711 0.112539 0.119476 5.293305
# Tokens whose average tf-idf lies below the first quartile are collected for
# removal. The cut-off used to be the literal 0.017026 copied from the summary
# above; it is now computed, so it stays correct when the input data changes.
tfidf_q1 <- unname(quantile(term_avg_tfidf$tfidf_avg, probs = 0.25, na.rm = TRUE))
term_remove <- term_avg_tfidf %>%
  filter(tfidf_avg < tfidf_q1) %>%
  .$word
term_remove %>% head
## [1] "00"   "11"   "12"   "2200" "2700" "30"