載入所需套件
setwd("C:/learning/mid")
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
## Loading required package: scales
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## The following objects are masked from 'package:data.table':
##
## dcast, melt
載入檔案
# Read the two Agoda review CSV files (paths are relative to the working
# directory).
agoda <- fread("agoda_reviews.csv")
agoda1 <- fread("agoda_reviews2.csv")
結巴斷詞
# jiebaR segmentation worker configured with the stop-word list and the
# user dictionary named below.
jieba_tokenizer <- worker(user = "user_words.txt", stop_word = "stop_words.txt")

# Custom tokenizer: takes a vector of texts `t` and returns a list with one
# character vector per text, holding the segmented tokens that are longer
# than one character.
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    pieces <- segment(x, jieba_tokenizer)
    pieces[nchar(pieces) > 1]
  })
}
對agoda的正評語做斷詞
# Split the ReviewPos text into one token per row with the custom tokenizer,
# then keep only the hotel, country, rating and token columns.
tidybook <- agoda %>%
  unnest_tokens(wordp, ReviewPos, token = book_tokenizer) %>%
  select(HotelName, Country, Rate, wordp)
head(tidybook)
## HotelName Country Rate wordp
## 1: 澎澄飯店 (Discovery Hotel) tw 9.2 房間
## 2: 澎澄飯店 (Discovery Hotel) tw 9.2 乾淨
## 3: 澎澄飯店 (Discovery Hotel) tw 9.2 view
## 4: 澎澄飯店 (Discovery Hotel) tw 9.2 不錯
## 5: 澎澄飯店 (Discovery Hotel) tw 10.0 南海
## 6: 澎澄飯店 (Discovery Hotel) tw 10.0 遊客
出現大於100次的畫長條圖
# Horizontal bar chart of the tokens that occur more than 100 times, with the
# bars ordered by count.
tidybook %>%
  count(wordp, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(wordp = reorder(wordp, n)) %>%
  ggplot(aes(x = wordp, y = n)) +
  geom_col() +
  labs(x = NULL, y = "出現次數") +
  coord_flip()

文字雲
# Count every token longer than one character, keep those seen more than 20
# times, and sort from most to least frequent.
tokens_count <- tidybook %>%
  filter(nchar(wordp) > 1) %>%
  group_by(wordp) %>%
  summarise(sum = n()) %>%
  filter(sum > 20) %>%
  arrange(desc(sum))
head(tokens_count)
## # A tibble: 6 x 2
## wordp sum
## <chr> <int>
## 1 服務 373
## 2 房間 337
## 3 早餐 320
## 4 親切 272
## 5 舒適 255
## 6 乾淨 244
# Draw a word cloud from the token / count table.
wordcloud2(tokens_count)
增加情緒字典
# Build the sentiment lookup table from the LIWC positive / negative word
# lists. Each file is read as one string and split on commas.
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")
# Strip surrounding whitespace from every word (e.g. a trailing newline at the
# end of the file); otherwise such a word could never match a token.
positive <- trimws(strsplit(liwc_p, ",", fixed = TRUE)[[1]])
negative <- trimws(strsplit(liwc_n, ",", fixed = TRUE)[[1]])
# Spell out FALSE: the shorthand `F` is an ordinary variable that can be
# reassigned.
positive <- data.frame(
  wordp = positive,
  sentiment = "positive",
  stringsAsFactors = FALSE
)
negative <- data.frame(
  wordp = negative,
  sentiment = "negative",
  stringsAsFactors = FALSE
)
# One row per dictionary word with its sentiment label.
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch)
## wordp sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
各飯店情緒差值
# Per-hotel sentiment balance: match tokens against the sentiment lookup
# table, count positive and negative hits per hotel, spread them into two
# columns (0 where a hotel has none) and take positive minus negative.
bt_sentiment <- tidybook %>%
  # Name the join key explicitly so it cannot silently change if the two
  # tables ever share another column name.
  inner_join(LIWC_ch, by = "wordp") %>%
  count(HotelName, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentimentx = positive - negative)
head(bt_sentiment)
## # A tibble: 6 x 4
## HotelName negative positive sentimentx
## <chr> <dbl> <dbl> <dbl>
## 1 180度沙灘最前線民宿 (Beach Front Villa) 0 14 14
## 2 Happy Lodge Guest House 0 19 19
## 3 一二三石頭人民宿 (Penghu 123 V-Stone B & B) 4 162 158
## 4 人魚之丘度假旅店 (La Villa de la Sirene) 0 6 6
## 5 元泰大飯店 (Yentai Hotel) 2 99 97
## 6 日立大飯店 (Jih Lih Hotel) 1 72 71
# One bar per hotel showing its positive-minus-negative sentiment word count.
ggplot(bt_sentiment, aes(x = HotelName, y = sentimentx, fill = HotelName)) +
  geom_col(width = 0.8, show.legend = FALSE) +
  labs(x = "飯店", y = "情緒差值")

各情緒詞出現次數並遞減排序
# Occurrences of each sentiment word across all reviews, most frequent first.
word_count <- tidybook %>%
  # Name the join key explicitly rather than relying on whichever columns the
  # two tables happen to share.
  inner_join(LIWC_ch, by = "wordp") %>%
  count(wordp, sentiment, sort = TRUE) %>%
  ungroup()
word_count
## # A tibble: 157 x 3
## wordp sentiment n
## <chr> <chr> <int>
## 1 親切 positive 272
## 2 舒適 positive 255
## 3 乾淨 positive 244
## 4 不錯 positive 137
## 5 很棒 positive 105
## 6 好吃 positive 87
## 7 熱情 positive 76
## 8 舒服 positive 75
## 9 熱心 positive 56
## 10 整潔 positive 53
## # ... with 147 more rows
分別呈現正負情緒詞出現次數前10名
# Top 10 words by count within each sentiment, drawn as horizontal bars in one
# facet per sentiment.
word_count %>%
  group_by(sentiment) %>%
  # Rank by the count column explicitly; without `wt`, top_n() uses whichever
  # column happens to be last in the table.
  top_n(10, wt = n) %>%
  ungroup() %>%
  mutate(wordp = reorder(wordp, n)) %>%
  ggplot(aes(x = wordp, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

計算各飯店每個斷詞的出現次數,並加上各飯店的斷詞總數
# Count of each token within each hotel, largest counts first.
com_word <- tidybook %>%
  count(HotelName, wordp, sort = TRUE)
# Total number of tokens per hotel.
total_words <- com_word %>%
  group_by(HotelName) %>%
  summarize(total = sum(n))
# Attach each hotel's total to its token rows; the join key is named
# explicitly rather than inferred from the shared column names.
com_word <- left_join(com_word, total_words, by = "HotelName")
com_word
## # A tibble: 7,132 x 4
## HotelName wordp n total
## <chr> <chr> <int> <int>
## 1 一二三石頭人民宿 (Penghu 123 V-Stone B & B) 老闆 60 617
## 2 綠的旅店 (Green Hotel) 服務 51 760
## 3 安一海景大飯店 (AN-I Sea View Hotel) 早餐 37 495
## 4 雅霖大飯店 (Ya Ling Hotel) 服務 35 671
## 5 雅霖大飯店 (Ya Ling Hotel) 早餐 33 671
## 6 元泰大飯店 (Yentai Hotel) 房間 30 604
## 7 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 民宿 29 1068
## 8 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 早餐 29 1068
## 9 一二三石頭人民宿 (Penghu 123 V-Stone B & B) 乾淨 28 617
## 10 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast) 乾淨 28 1068
## # ... with 7,122 more rows
增加rank及詞頻(term frequency)欄位
# Within each hotel, number the rows in their current order as `rank` and
# express each token count as a share of that hotel's total tokens.
# The result stays grouped by HotelName.
freq_by_rank <- com_word %>%
  group_by(HotelName) %>%
  mutate(
    rank = row_number(),
    `term frequency` = n / total
  )
freq_by_rank
## # A tibble: 7,132 x 6
## # Groups: HotelName [33]
## HotelName wordp n total rank `term frequency`
## <chr> <chr> <int> <int> <int> <dbl>
## 1 一二三石頭人民宿 (Penghu 123 V-Stone B~ 老闆 60 617 1 0.0972
## 2 綠的旅店 (Green Hotel) 服務 51 760 1 0.0671
## 3 安一海景大飯店 (AN-I Sea View Hotel)~ 早餐 37 495 1 0.0747
## 4 雅霖大飯店 (Ya Ling Hotel) 服務 35 671 1 0.0522
## 5 雅霖大飯店 (Ya Ling Hotel) 早餐 33 671 2 0.0492
## 6 元泰大飯店 (Yentai Hotel) 房間 30 604 1 0.0497
## 7 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 民宿 29 1068 1 0.0272
## 8 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 早餐 29 1068 2 0.0272
## 9 一二三石頭人民宿 (Penghu 123 V-Stone B~ 乾淨 28 617 2 0.0454
## 10 幸運海彎海景民宿 (Lucky Bay Bed and Br~ 乾淨 28 1068 3 0.0262
## # ... with 7,122 more rows
畫出rank及詞頻的關係線圖,檢驗 Zipf’s law
# Term frequency against rank on log-log axes, one line per hotel, plus a
# dashed grey reference line with intercept -0.62 and slope -1.1.
freq_by_rank %>%
  ggplot(aes(x = rank, y = `term frequency`, colour = HotelName)) +
  geom_abline(slope = -1.1, intercept = -0.62, colour = "gray50", linetype = 2) +
  geom_line(alpha = 0.8, size = 1.1, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

總評語bind_tf_idf
雅霖大飯店/幸運海彎海景民宿/澎澄飯店/平湖窩行旅
# tf-idf for every token, with each hotel treated as one document.
bt_words1 <- bind_tf_idf(com_word, wordp, HotelName, n)
# Pull out the rows of four hotels by matching part of the hotel name.
bt_words3 <- bt_words1[grepl("幸運海彎海景民宿", bt_words1$HotelName), ]
bt_words2 <- bt_words1[grepl("雅霖大飯店", bt_words1$HotelName), ]
bt_words4 <- bt_words1[grepl("澎澄飯店", bt_words1$HotelName), ]
bt_words5 <- bt_words1[grepl("平湖窩行旅", bt_words1$HotelName), ]
bt_words <- rbind(bt_words2, bt_words3, bt_words5, bt_words4)
# Print the selected rows from highest to lowest tf-idf, without the
# per-hotel total column.
bt_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 1,368 x 6
## HotelName wordp n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 幸運海彎海景民宿 (Lucky Bay Bed and Breakfast)~ 老闆娘~ 23 0.0215 1.10 0.0237
## 2 雅霖大飯店 (Ya Ling Hotel) 飯店 19 0.0283 0.724 0.0205
## 3 澎澄飯店 (Discovery Hotel) 飯店 10 0.0257 0.724 0.0186
## 4 平湖窩行旅 (PH Hostel) 吐司 3 0.00523 3.50 0.0183
## 5 平湖窩行旅 (PH Hostel) 背包客~ 3 0.00523 3.50 0.0183
## 6 平湖窩行旅 (PH Hostel) 餐包 3 0.00523 3.50 0.0183
## 7 澎澄飯店 (Discovery Hotel) 28 2 0.00514 3.50 0.0180
## 8 澎澄飯店 (Discovery Hotel) 失望 2 0.00514 3.50 0.0180
## 9 澎澄飯店 (Discovery Hotel) 免稅店~ 2 0.00514 3.50 0.0180
## 10 澎澄飯店 (Discovery Hotel) 尚未 2 0.00514 3.50 0.0180
## # ... with 1,358 more rows
這四家飯店 tf-idf 最高的前15個字詞畫視覺圖
# For each of the selected hotels, plot the 15 tokens with the highest tf-idf
# as horizontal bars, one facet per hotel.
bt_words %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor level order from the tf-idf sort above.
  mutate(wordp = factor(wordp, levels = rev(unique(wordp)))) %>%
  group_by(HotelName) %>%
  # Rank by tf_idf explicitly; without `wt`, top_n() uses whichever column
  # happens to be last in the table.
  top_n(15, wt = tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = wordp, y = tf_idf, fill = HotelName)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~HotelName, ncol = 2, scales = "free") +
  coord_flip()
