載入所需套件

# NOTE(review): hard-coded, machine-specific working directory. The relative
# paths used further down (review CSVs, stop-word list, user dictionary) are
# resolved against it, so the script only runs where this folder exists.
setwd("C:/learning/mid")
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

載入檔案

# Load the two review CSV files (paths are relative to the working directory).
agoda <- fread(input = "agoda_reviews.csv")
agoda1 <- fread(input = "agoda_reviews2.csv")

結巴斷詞

# jiebaR segmentation worker configured with a stop-word list and a user
# dictionary (both paths are relative to the working directory).
jieba_tokenizer <- worker(
  user = "user_words.txt",
  stop_word = "stop_words.txt"
)

# Custom tokenizer: segments every element of `t` with the jieba worker and
# keeps only the tokens that are longer than one character.
# Returns a list holding one character vector of tokens per input element.
book_tokenizer <- function(t) {
  keep_long <- function(words) words[nchar(words) > 1]
  lapply(t, function(doc) keep_long(segment(doc, jieba_tokenizer)))
}

對 agoda 的正面評語(ReviewPos)做斷詞

# Split the ReviewPos text into one row per token (column `wordp`) using the
# jieba-based tokenizer, keeping only the hotel name, country and rate
# columns next to each token.
tidybook <- agoda %>%
  unnest_tokens(output = wordp, input = ReviewPos, token = book_tokenizer) %>%
  select(HotelName, Country, Rate, wordp)

head(tidybook)
##                     HotelName Country Rate wordp
## 1: 澎澄飯店 (Discovery Hotel)      tw  9.2  房間
## 2: 澎澄飯店 (Discovery Hotel)      tw  9.2  乾淨
## 3: 澎澄飯店 (Discovery Hotel)      tw  9.2  view
## 4: 澎澄飯店 (Discovery Hotel)      tw  9.2  不錯
## 5: 澎澄飯店 (Discovery Hotel)      tw 10.0  南海
## 6: 澎澄飯店 (Discovery Hotel)      tw 10.0  遊客

將出現次數大於 100 次的詞畫成長條圖

# Horizontal bar chart of the tokens that occur more than 100 times, with
# the bars ordered by their count.
tidybook %>%
  count(wordp, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(wordp = reorder(wordp, n)) %>%
  ggplot(aes(x = wordp, y = n)) +
  geom_col() +
  labs(x = NULL, y = "出現次數") +
  coord_flip()

文字雲

# Frequency table for the word cloud: one row per token with its number of
# occurrences (`sum`), restricted to tokens seen more than 20 times and
# sorted from most to least frequent.
tokens_count <- tidybook %>%
  # Refer to the column through data masking rather than `.$wordp`, which
  # reads the whole piped table and breaks as soon as the input is grouped.
  filter(nchar(wordp) > 1) %>%
  group_by(wordp) %>%
  summarise(sum = n()) %>%
  filter(sum > 20) %>%
  arrange(desc(sum))
head(tokens_count)
## # A tibble: 6 x 2
##   wordp   sum
##   <chr> <int>
## 1 服務    373
## 2 房間    337
## 3 早餐    320
## 4 親切    272
## 5 舒適    255
## 6 乾淨    244
# Draw the word cloud from the token frequency table.
wordcloud2(data = tokens_count)

增加情緒字典

# Read the LIWC positive / negative word lists as single strings.
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")

# Split each list on commas. Entries are trimmed and empty ones dropped so
# that a trailing newline or a space after a comma in the dictionary file
# cannot keep a word from matching the segmented tokens.
positive <- trimws(strsplit(liwc_p, ",", fixed = TRUE)[[1]])
negative <- trimws(strsplit(liwc_n, ",", fixed = TRUE)[[1]])
positive <- positive[nzchar(positive)]
negative <- negative[nzchar(negative)]

# Build the word -> sentiment lookup table. rep() keeps the two columns the
# same length even if a list turns out to be empty.
positive <- data.frame(
  wordp = positive,
  sentiment = rep("positive", length(positive)),
  stringsAsFactors = FALSE
)
negative <- data.frame(
  wordp = negative,
  sentiment = rep("negative", length(negative)),
  stringsAsFactors = FALSE
)

LIWC_ch <- rbind(positive, negative)
head(LIWC_ch)
##      wordp sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

各飯店情緒差值

# Per-hotel sentiment balance: count the positive and negative dictionary
# words found among each hotel's tokens and take the difference.
bt_sentiment <- tidybook %>%
  # Name the join key explicitly so the match cannot silently widen to any
  # other column the two tables might come to share.
  inner_join(LIWC_ch, by = "wordp") %>%
  count(HotelName, sentiment) %>%
  # One column per sentiment; a hotel with no words of a class gets 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentimentx = positive - negative)
head(bt_sentiment)
## # A tibble: 6 x 4
##   HotelName                                   negative positive sentimentx
##   <chr>                                          <dbl>    <dbl>      <dbl>
## 1 180度沙灘最前線民宿  (Beach Front Villa)           0       14         14
## 2 Happy Lodge Guest House                            0       19         19
## 3 一二三石頭人民宿 (Penghu 123 V-Stone B & B)        4      162        158
## 4 人魚之丘度假旅店 (La Villa de la Sirene)           0        6          6
## 5 元泰大飯店 (Yentai Hotel)                          2       99         97
## 6 日立大飯店 (Jih Lih Hotel)                         1       72         71
# Bar chart of the sentiment difference (sentimentx) for every hotel.
ggplot(bt_sentiment, aes(x = HotelName, y = sentimentx, fill = HotelName)) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "飯店", y = "情緒差值")

各情緒詞出現次數並遞減排序

  # Occurrences of every sentiment-dictionary word, most frequent first.
  # The join key is spelled out so the match is limited to `wordp`.
  word_count <- tidybook %>%
    inner_join(LIWC_ch, by = "wordp") %>%
    count(wordp, sentiment, sort = TRUE) %>%
    ungroup()
word_count
## # A tibble: 157 x 3
##    wordp sentiment     n
##    <chr> <chr>     <int>
##  1 親切  positive    272
##  2 舒適  positive    255
##  3 乾淨  positive    244
##  4 不錯  positive    137
##  5 很棒  positive    105
##  6 好吃  positive     87
##  7 熱情  positive     76
##  8 舒服  positive     75
##  9 熱心  positive     56
## 10 整潔  positive     53
## # ... with 147 more rows

分別呈現正負情緒詞出現次數前10名

# Ten most frequent words for each sentiment, drawn as horizontal bars with
# one facet per sentiment.
word_count %>%
  group_by(sentiment) %>%
  # Rank explicitly by the count column; a bare top_n(10) uses whatever
  # column happens to be last in the table.
  top_n(10, n) %>%
  ungroup() %>%
  mutate(wordp = reorder(wordp, n)) %>%
  ggplot(aes(wordp, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

計算各飯店每個詞的出現次數,並加上各飯店的總詞數

# Number of times each token occurs for each hotel, most frequent first.
com_word <- tidybook %>%
  count(HotelName, wordp, sort = TRUE)

# Total number of tokens per hotel.
total_words <- com_word %>%
  group_by(HotelName) %>%
  summarise(total = sum(n))

# Attach each hotel's total to its rows; the key is given explicitly so the
# join stays on HotelName only.
com_word <- left_join(com_word, total_words, by = "HotelName")
com_word
## # A tibble: 7,132 x 4
##    HotelName                                      wordp     n total
##    <chr>                                          <chr> <int> <int>
##  1 一二三石頭人民宿 (Penghu 123 V-Stone B & B)    老闆     60   617
##  2 綠的旅店 (Green Hotel)                         服務     51   760
##  3 安一海景大飯店 (AN-I Sea View Hotel)           早餐     37   495
##  4 雅霖大飯店 (Ya Ling Hotel)                     服務     35   671
##  5 雅霖大飯店 (Ya Ling Hotel)                     早餐     33   671
##  6 元泰大飯店 (Yentai Hotel)                      房間     30   604
##  7 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 民宿     29  1068
##  8 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 早餐     29  1068
##  9 一二三石頭人民宿 (Penghu 123 V-Stone B & B)    乾淨     28   617
## 10 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 乾淨     28  1068
## # ... with 7,122 more rows

增加 rank 及詞頻(term frequency)欄位

# Within each hotel, number the rows in their current order (com_word was
# counted with sort = TRUE) and express each count as a share of the hotel's
# total tokens. The result stays grouped by HotelName.
freq_by_rank <- mutate(
  group_by(com_word, HotelName),
  rank = row_number(),
  `term frequency` = n / total
)

freq_by_rank
## # A tibble: 7,132 x 6
## # Groups:   HotelName [33]
##    HotelName                       wordp     n total  rank `term frequency`
##    <chr>                           <chr> <int> <int> <int>            <dbl>
##  1 一二三石頭人民宿 (Penghu 123 V-Stone B~ 老闆     60   617     1           0.0972
##  2 綠的旅店 (Green Hotel)          服務     51   760     1           0.0671
##  3 安一海景大飯店 (AN-I Sea View Hotel)~ 早餐     37   495     1           0.0747
##  4 雅霖大飯店 (Ya Ling Hotel)      服務     35   671     1           0.0522
##  5 雅霖大飯店 (Ya Ling Hotel)      早餐     33   671     2           0.0492
##  6 元泰大飯店 (Yentai Hotel)       房間     30   604     1           0.0497
##  7 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 民宿     29  1068     1           0.0272
##  8 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 早餐     29  1068     2           0.0272
##  9 一二三石頭人民宿 (Penghu 123 V-Stone B~ 乾淨     28   617     2           0.0454
## 10 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 乾淨     28  1068     3           0.0262
## # ... with 7,122 more rows

畫出 rank 與詞頻的關係線圖,檢驗 Zipf’s law

# Rank against term frequency on log-log axes, one line per hotel, drawn on
# top of a dashed grey reference line (intercept -0.62, slope -1.1).
ggplot(
  freq_by_rank,
  aes(x = rank, y = `term frequency`, colour = HotelName)
) +
  geom_abline(
    intercept = -0.62,
    slope = -1.1,
    colour = "gray50",
    linetype = 2
  ) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

以 bind_tf_idf 計算各飯店正面評語的 tf-idf

雅霖大飯店/幸運海彎海景民宿/澎澄飯店/平湖窩行旅

# tf-idf of every token, treating each hotel as one document.
bt_words1 <- bind_tf_idf(com_word, wordp, HotelName, n)

# Keep the rows of the four hotels of interest, matched on a fragment of the
# hotel name.
bt_words2 <- filter(bt_words1, grepl("雅霖大飯店", HotelName))
bt_words3 <- filter(bt_words1, grepl("幸運海彎海景民宿", HotelName))
bt_words4 <- filter(bt_words1, grepl("澎澄飯店", HotelName))
bt_words5 <- filter(bt_words1, grepl("平湖窩行旅", HotelName))

# Stack the four subsets (order: 2, 3, 5, 4).
bt_words <- rbind(bt_words2, bt_words3, bt_words5, bt_words4)

# Show the selected hotels' tokens from highest to lowest tf-idf, leaving
# out the per-hotel `total` column.
arrange(select(bt_words, -total), desc(tf_idf))
## # A tibble: 1,368 x 6
##    HotelName                               wordp     n      tf   idf tf_idf
##    <chr>                                   <chr> <int>   <dbl> <dbl>  <dbl>
##  1 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast)~ 老闆娘~    23 0.0215  1.10  0.0237
##  2 雅霖大飯店 (Ya Ling Hotel)              飯店     19 0.0283  0.724 0.0205
##  3 澎澄飯店 (Discovery Hotel)              飯店     10 0.0257  0.724 0.0186
##  4 平湖窩行旅 (PH Hostel)                  吐司      3 0.00523 3.50  0.0183
##  5 平湖窩行旅 (PH Hostel)                  背包客~     3 0.00523 3.50  0.0183
##  6 平湖窩行旅 (PH Hostel)                  餐包      3 0.00523 3.50  0.0183
##  7 澎澄飯店 (Discovery Hotel)              28        2 0.00514 3.50  0.0180
##  8 澎澄飯店 (Discovery Hotel)              失望      2 0.00514 3.50  0.0180
##  9 澎澄飯店 (Discovery Hotel)              免稅店~     2 0.00514 3.50  0.0180
## 10 澎澄飯店 (Discovery Hotel)              尚未      2 0.00514 3.50  0.0180
## # ... with 1,358 more rows

將這四家飯店 tf-idf 最高的前 15 個字詞視覺化

# For each selected hotel, plot the 15 tokens with the highest tf-idf as
# horizontal bars, one facet per hotel.
bt_words %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels while the rows are in descending tf-idf order so
  # the bar order follows the score rather than the alphabet.
  mutate(wordp = factor(wordp, levels = rev(unique(wordp)))) %>%
  group_by(HotelName) %>%
  # Rank explicitly by tf_idf instead of relying on top_n() falling back to
  # the last column of the table.
  top_n(15, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(wordp, tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~HotelName, ncol = 2, scales = "free") +
  coord_flip()