setwd("C:/learning/mid")
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
# Load the Booking.com reviews and convert the ReviewDate text to Date.
booking <- fread("C:/learning/mid/booking_reviews.csv")
# Fix: the previous format string "%Y年%sep,m月%sep,d日" contained stray "sep,"
# text, so no date matched and every ReviewDate became NA (the str() output
# recorded below still shows that pre-fix state).
# NOTE(review): assumes the Booking dates use the same "YYYY年MM月DD日" layout
# as the Agoda file -- confirm against the raw CSV.
booking$ReviewDate <- as.Date(booking$ReviewDate, format = "%Y年%m月%d日")
str(booking)
## Classes 'data.table' and 'data.frame': 12255 obs. of 11 variables:
## $ V1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ HotelName : chr "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" "雅霖大飯店" ...
## $ Country : chr "臺灣" "臺灣" "臺灣" "臺灣" ...
## $ Rate : num 10 8.3 10 10 9.2 7.5 10 8.3 8.8 8.3 ...
## $ Review : chr "服務人員態度很好。" "非常好" "傑出" "舒服" ...
## $ ReviewDate: Date, format: NA NA ...
## $ ReviewNeg : chr "這次住宿沒有陽台" "" "" "每天早餐套餐都一樣。" ...
## $ ReviewPos : chr "" "在清明連假入住櫃檯人員鄭小姐服務親切友善即使遊客多也沒有不耐煩" "很乾淨,飯店人員很親切" "" ...
## $ V9 : logi NA NA NA NA NA NA ...
## $ V10 : logi NA NA NA NA NA NA ...
## $ V11 : logi NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
# Load the Agoda reviews and convert the "YYYY年MM月DD日" ReviewDate text to Date.
agoda <- fread("C:/learning/mid/agoda_reviews.csv")
agoda$ReviewDate <- as.Date(agoda$ReviewDate, format = "%Y年%m月%d日")
str(agoda)
## Classes 'data.table' and 'data.frame': 2493 obs. of 8 variables:
## $ V1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ HotelName : chr "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" "澎澄飯店 (Discovery Hotel)" ...
## $ Country : chr "tw" "tw" "tw" "tw" ...
## $ Rate : num 10 9.2 10 10 9.2 10 10 9.2 8.8 9.2 ...
## $ Review : chr "很有設計的飯店,不論是美感或是體貼房客使用的設計,另外正對港景的view令人讚賞,不論是白天的波光點點或是晚上的舒"| __truncated__ "" "" "位置佳,飯店有機場接駁,從飯店走到鬧區十來分鐘,散步很棒房間乾淨,小女有異位體質,不乾淨的地方會馬上過敏,住宿"| __truncated__ ...
## $ ReviewDate: Date, format: "2019-02-22" "2018-12-08" ...
## $ ReviewNeg : chr "" "早餐不夠豐盛" "" "" ...
## $ ReviewPos : chr "" "房間乾淨,view不錯" "南海遊客中心對面交通便利,緊鄰PIER 3百貨非常棒" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Keep the review columns shared by both sites, tag each row with its source
# site in `web`, and stack the two tables into `com`.
booking <- booking %>%
  select(HotelName, Country, Rate, Review, ReviewNeg) %>%
  mutate(web = "bk")
# Fix: this step previously selected from `booking`, so `agoda` was overwritten
# with a copy of the Booking rows and `com` contained the Booking reviews twice
# and no Agoda reviews at all. Outputs recorded further down in this script
# were produced before the fix and will change when the script is re-run.
agoda <- agoda %>%
  select(HotelName, Country, Rate, Review, ReviewNeg) %>%
  mutate(web = "ag")
com <- rbind(booking, agoda)
# jieba segmentation worker configured with a stop-word list and a user
# dictionary; both file names are relative to the working directory.
jieba_tokenizer <- worker(
  user = "user_words.txt",
  stop_word = "stop_words.txt"
)
# Custom tokenizer (passed as `token =` to unnest_tokens() in this script).
# Segments every element of `t` with `jieba_tokenizer` and drops tokens that
# are only one character long.
# Returns a list with one character vector of tokens per element of `t`.
book_tokenizer <- function(t) {
  keep_multichar <- function(doc) {
    pieces <- segment(doc, jieba_tokenizer)
    pieces[nchar(pieces) > 1]
  }
  lapply(t, keep_multichar)
}
# Read the LIWC positive and negative word lists (each file is one
# comma-separated string).
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")
# Split the LIWC lists into individual words and build the
# word -> sentiment lookup table.
positive <- data.frame(
  word = strsplit(liwc_p, "[,]")[[1]],
  sentiment = "positive",
  stringsAsFactors = FALSE
)
negative <- data.frame(
  word = strsplit(liwc_n, "[,]")[[1]],
  sentiment = "negative",
  stringsAsFactors = FALSE
)
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
# Tokenize the review text into `word` and the negative-review text into
# `wordn`, keeping the hotel metadata alongside.
# NOTE(review): the two unnest_tokens() calls are chained, so each `word`
# token is repeated once for every `wordn` token of the same review (visible
# in the recorded head() output below). Counts of either column taken from
# tidybook1 are therefore scaled by the token count of the other text.
tidybook1 <- com %>%
  unnest_tokens(word, Review, token = book_tokenizer) %>%
  unnest_tokens(wordn, ReviewNeg, token = book_tokenizer) %>%
  select(HotelName, Country, Rate, word, wordn)
head(tidybook1)
## HotelName Country Rate word wordn
## 1 雅霖大飯店 臺灣 10 服務 這次
## 1.3 雅霖大飯店 臺灣 10 服務 住宿
## 1.4 雅霖大飯店 臺灣 10 服務 陽台
## 1.1 雅霖大飯店 臺灣 10 人員 這次
## 1.1.1 雅霖大飯店 臺灣 10 人員 住宿
## 1.1.2 雅霖大飯店 臺灣 10 人員 陽台
# Frequency table of review tokens: tokens longer than one character that
# occur more than 50 times, most frequent first.
tokens_count1 <- tidybook1 %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 50) %>%
  arrange(desc(sum))
head(tokens_count1)
## # A tibble: 6 x 2
## word sum
## <chr> <int>
## 1 傑出 4866
## 2 令人 3786
## 3 很棒 3448
## 4 民宿 2524
## 5 愉悅 2298
## 6 好極了 2226
# Word cloud of the frequent review tokens.
wordcloud2(tokens_count1)
# Frequency table of negative-review tokens: tokens longer than one character
# that occur more than 50 times, most frequent first.
tokens_count2 <- tidybook1 %>%
  filter(nchar(wordn) > 1) %>%
  group_by(wordn) %>%
  summarise(sum = n()) %>%
  filter(sum > 50) %>%
  arrange(desc(sum))
# Fix: this previously printed head(tokens_count1) again, so the table just
# built was never inspected. The output that was recorded here was a duplicate
# of the tokens_count1 listing and has been dropped; re-run to regenerate it.
head(tokens_count2)
# Word cloud of the frequent negative-review tokens.
tokens_count2 %>% wordcloud2()
# Horizontal bar chart of review tokens that occur more than 1000 times,
# ordered by count.
tidybook1 %>%
  count(word, sort = TRUE) %>%
  filter(n > 1000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()

# Horizontal bar chart of negative-review tokens that occur more than 600
# times, ordered by count.
# Fix: the reordered factor was previously stored in a new `word` column while
# the plot mapped `wordn`, so the bars were never sorted by frequency.
tidybook1 %>%
  count(wordn, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(wordn, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

# Comparison word cloud: frequent review tokens found in the LIWC dictionary,
# reshaped into a word x sentiment matrix of counts.
tokens_count1 %>%
  inner_join(LIWC_ch) %>%
  select(word, sentiment, sum) %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  wordcloud::comparison.cloud(
    colors = c("gray80", "indianred3"),
    max.words = 108,
    random.order = FALSE
  )
## Joining, by = "word"

# Per-hotel sentiment: count LIWC-matched review tokens by sentiment, spread
# them into `negative` / `positive` columns (0 when absent), and take the
# difference as `sentimentx`.
calsentiment <- tidybook1 %>%
  inner_join(LIWC_ch) %>%
  count(HotelName, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentimentx = positive - negative)
## Joining, by = "word"
head(calsentiment)
## # A tibble: 6 x 4
## HotelName negative positive sentimentx
## <chr> <dbl> <dbl> <dbl>
## 1 ?樂民宿 36 430 394
## 2 180度沙灘最前線民宿 0 10 10
## 3 198 紅帽民宿 0 74 74
## 4 2巷9號海景民宿 0 8 8
## 5 35民宿 0 18 18
## 6 525民宿 0 268 268
# Bar chart of the positive-minus-negative sentiment score for each hotel.
ggplot(
  calsentiment,
  aes(x = HotelName, y = sentimentx, fill = HotelName)
) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "飯店", y = "情緒差值")

# Flip the negative counts below zero, then reshape to long form with one row
# per hotel and sentiment (`cnt` holds the signed count).
calsentiment$negative <- -calsentiment$negative
calsentiment <- gather(
  calsentiment,
  key = sentiment,
  value = cnt,
  negative:positive
)
head(calsentiment)
## # A tibble: 6 x 4
## HotelName sentimentx sentiment cnt
## <chr> <dbl> <chr> <dbl>
## 1 ?樂民宿 394 negative -36
## 2 180度沙灘最前線民宿 10 negative 0
## 3 198 紅帽民宿 74 negative 0
## 4 2巷9號海景民宿 8 negative 0
## 5 35民宿 18 negative 0
## 6 525民宿 268 negative 0
# Diverging bar chart of signed sentiment counts per hotel, labelled with the
# net sentiment score.
ggplot(calsentiment, aes(x = HotelName, y = cnt, fill = sentiment)) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "HotelName", y = "情緒值") +
  geom_text(aes(label = sentimentx))

# Occurrences of each LIWC-matched review token together with its sentiment,
# most frequent first.
word_count <- tidybook1 %>%
  inner_join(LIWC_ch) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
word_count
## # A tibble: 145 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 傑出 positive 4866
## 2 很棒 positive 3448
## 3 愉悅 positive 2298
## 4 不錯 positive 1950
## 5 失望 negative 1620
## 6 舒適 positive 1570
## 7 親切 positive 1112
## 8 乾淨 positive 1084
## 9 問題 negative 958
## 10 值得 positive 780
## # ... with 135 more rows
# Top 10 review tokens per sentiment, one facet per sentiment, bars ordered
# by count.
word_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = NULL, y = "Contribution to sentiment") +
  coord_flip()
## Selecting by n

# Occurrences of each LIWC-matched negative-review token (`wordn`) together
# with its sentiment, most frequent first.
# Fix: the dictionary was previously joined on `word` (the Review token) while
# `wordn` was counted, so every negative-review token inherited the sentiment
# of whichever Review token shared its row (e.g. "房間" and "早餐" were
# reported as positive). The dictionary is now matched against `wordn` itself.
word_count1 <- tidybook1 %>%
  inner_join(LIWC_ch, by = c("wordn" = "word")) %>%
  count(wordn, sentiment, sort = TRUE) %>%
  ungroup()
word_count1
# The output recorded here came from the mismatched join and has been removed;
# re-run the script to regenerate it.
# Top 10 `wordn` tokens per sentiment in word_count1, one facet per sentiment,
# bars ordered by count.
word_count1 %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(wordn = reorder(wordn, n)) %>%
  ggplot(aes(x = wordn, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = NULL, y = "Contribution to sentiment") +
  coord_flip()
## Selecting by n

# Token counts per hotel (sorted by descending count), plus each hotel's
# total token count in `total`.
com_word <- count(tidybook1, HotelName, word, sort = TRUE)
total_words <- com_word %>%
  group_by(HotelName) %>%
  summarise(total = sum(n))
com_word <- left_join(com_word, total_words)
## Joining, by = "HotelName"
com_word
## # A tibble: 5,890 x 4
## HotelName word n total
## <chr> <chr> <int> <int>
## 1 平湖窩行旅 規劃 500 15888
## 2 澎湖安一海景大飯店 hotel 448 13782
## 3 平湖窩行旅 不錯 424 15888
## 4 平湖窩行旅 質感 392 15888
## 5 百勝民宿 was 390 3512
## 6 豐谷大飯店 ok 376 3326
## 7 豐谷大飯店 卡拉 376 3326
## 8 日立大飯店 令人 374 13392
## 9 平湖窩行旅 整體 366 15888
## 10 寶華大飯店 hotel 364 8902
## # ... with 5,880 more rows
# Within each hotel, number the rows in their existing order as `rank` and
# compute each token's share of the hotel's total as `term frequency`.
# The result stays grouped by HotelName.
freq_by_rank <- com_word %>%
  group_by(HotelName) %>%
  mutate(
    rank = row_number(),
    `term frequency` = n / total
  )
freq_by_rank
## # A tibble: 5,890 x 6
## # Groups: HotelName [199]
## HotelName word n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 平湖窩行旅 規劃 500 15888 1 0.0315
## 2 澎湖安一海景大飯店 hotel 448 13782 1 0.0325
## 3 平湖窩行旅 不錯 424 15888 2 0.0267
## 4 平湖窩行旅 質感 392 15888 3 0.0247
## 5 百勝民宿 was 390 3512 1 0.111
## 6 豐谷大飯店 ok 376 3326 1 0.113
## 7 豐谷大飯店 卡拉 376 3326 2 0.113
## 8 日立大飯店 令人 374 13392 1 0.0279
## 9 平湖窩行旅 整體 366 15888 4 0.0230
## 10 寶華大飯店 hotel 364 8902 1 0.0409
## # ... with 5,880 more rows
# Rank vs. term frequency on log-log axes, one line per hotel.
ggplot(
  freq_by_rank,
  aes(x = rank, y = `term frequency`, color = HotelName)
) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

# Restrict to ranks strictly between 10 and 500 and fit a straight line to
# log10(term frequency) against log10(rank).
rank_subset <- filter(freq_by_rank, rank > 10, rank < 500)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.07228 -1.32710
# Rank vs. term frequency on log-log axes with the fitted power-law line.
# Fix: the reference line previously used intercept = -0.62 and slope = -1.1,
# which do not match the regression on rank_subset (recorded fit: -0.07228 and
# -1.32710). The model is refitted here so the line always follows the
# current data instead of hand-copied constants.
zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_coef <- unname(coef(zipf_fit))
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = HotelName)) +
  geom_abline(
    intercept = zipf_coef[1],
    slope = zipf_coef[2],
    color = "gray50",
    linetype = 2
  ) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

# tf-idf of every token, treating each hotel as one document.
book_words1 <- bind_tf_idf(com_word, word, HotelName, n)
# Pull out the rows of four hotels (matched by name pattern) and stack them.
book_words3 <- book_words1[grepl("雅霖大飯店", book_words1$HotelName), ]
book_words2 <- book_words1[grepl("澎澄飯店", book_words1$HotelName), ]
book_words4 <- book_words1[grepl("豐谷大飯店", book_words1$HotelName), ]
book_words5 <- book_words1[grepl("彩虹文旅", book_words1$HotelName), ]
book_words <- rbind(book_words2, book_words3, book_words5, book_words4)
# Top 10 tokens by tf-idf for each selected hotel, one facet per hotel.
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(HotelName) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~HotelName, ncol = 2, scales = "free") +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
## Selecting by tf_idf

# Average tf-idf of each token across all hotels, then list the tokens with
# the highest averages first.
term_avg_tfidf <- book_words1 %>%
  group_by(word) %>%
  summarise(tfidf_avg = mean(tf_idf))
arrange(term_avg_tfidf, desc(tfidf_avg))
## # A tibble: 2,185 x 2
## word tfidf_avg
## <chr> <dbl>
## 1 中選 5.29
## 2 共和 2.65
## 3 刻薄 2.65
## 4 最后 2.34
## 5 formidable 1.92
## 6 jour 1.92
## 7 優缺點 1.81
## 8 之外 1.76
## 9 擁有 1.76
## 10 輕鬆自在 1.70
## # ... with 2,175 more rows
# Distribution of the per-token average tf-idf.
summary(term_avg_tfidf$tfidf_avg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000666 0.017026 0.054711 0.112539 0.119476 5.293305
# Tokens whose average tf-idf falls below the first quartile.
# Fix: the cut-off was previously the literal 0.017026, copied (rounded) from
# the summary printed above, so it silently went stale whenever the data
# changed. It is now computed from the data; quantile()'s default method is
# the same one summary() reports.
term_remove <- term_avg_tfidf %>%
  filter(tfidf_avg < quantile(tfidf_avg, probs = 0.25, names = FALSE)) %>%
  pull(word)
head(term_remove)
## [1] "00" "11" "12" "2200" "2700" "30"