使用Text Mining with R - A Tidy Approach ch1~ch6 分析Hilton Hawaiian Village Tripadvisor review
Ch1.The tidy text format
library(dplyr)
library(readr)
library(lubridate)
library(ggplot2)
library(tidytext)
library(tidyverse)
library(stringr)
library(tidyr)
library(scales)
library(broom)
library(purrr)
library(widyr)
library(igraph)
library(ggraph)
library(SnowballC)
library(wordcloud)
library(reshape2)
library(topicmodels)
# Use the minimal ggplot2 theme for every plot drawn below.
theme_set(theme_minimal())

# Load the hotel review CSV; readr guesses the column types.
df <- read_csv(
  file = "Hilton_Hawaiian_Village_Waikiki_Beach_Resort-Honolulu_Oahu_Hawaii__en.csv"
)
Parsed with column specification:
cols(
review_body = col_character(),
review_date = col_character()
)
# Keep only the rows that have no missing value in any column.
df <- df[complete.cases(df), ]

# Sequential review id. seq_len() yields an empty vector when no rows remain,
# whereas 1:nrow(df) would yield c(1, 0) and make the assignment fail.
df$id <- seq_len(nrow(df))

# Parse the date strings: day, full month name, two-digit year.
df$review_date <- as.Date(df$review_date, format = "%d-%B-%y")

# Report the data dimensions and the first and last review dates.
dim(df)
min(df$review_date)
max(df$review_date)
[1] 13701 3
[1] "2002-03-21"
[1] "2018-08-02"
# Strip digits up front so that numbers do not clutter the word-network
# plots drawn later on.
df$review_body <- gsub("[[:digit:]]", "", df$review_body)

# Tokenise the reviews into one word per row, then drop the stop words.
data(stop_words)
tidy_df <- df %>%
  unnest_tokens(word, review_body) %>%
  anti_join(stop_words)
Joining, by = "word"
全部評論中的字頻
# Overall word frequencies, most common first.
count(tidy_df, word, sort = TRUE)

# Bar chart of the words that occur more than 5500 times.
tidy_df %>%
  count(word, sort = TRUE) %>%
  filter(n > 5500) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()

Ch2.Sentiment analysis
使用字典nrc 查看在評論中出現哪些joy的字
# Words that the NRC lexicon tags as "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words in the reviews. Note that beach, food, diamond, ...
# are also listed under joy.
tidy_df %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
Joining, by = "word"
可以看出主要是關於飯店的整潔clean,友善friendly,helpful,環境或建築的美麗pretty….等
原tidytext文章範例是以小說篇章編號為x軸,y軸為情緒分數來做圖,表現小說情緒依據劇情演進而產生的變化
不過hotel review就要變成以x軸為日期了
# Review dates run from 2002-03-21 to 2018-08-02.
summary(tidy_df[["review_date"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2002-03-21" "2012-01-05" "2014-04-28" "2013-08-26" "2016-03-14" "2018-08-02"
# Net Bing sentiment per review: count positive and negative words for each
# review, place the two counts side by side (0 when a class is absent), and
# subtract negative from positive.
sentiment <- tidy_df %>%
  inner_join(get_sentiments("bing")) %>%
  count(id, sentiment, review_date) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"
# Average the per-review net score (the `sentiment` column) by calendar year.
sen_byyear <- sentiment %>%
  group_by(year(review_date)) %>%
  summarise(sentiment = mean(sentiment)) %>%
  as.data.frame()

# Turn the year number into a Date anchored at 1 January. Parsing with
# as.Date(x, format = "%Y") would fill the missing month and day from the
# current date, so the point positions would depend on the day the script is
# run and would not line up with the yearly axis breaks.
sen_byyear$`year(review_date)` <- as.Date(
  paste0(sen_byyear$`year(review_date)`, "-01-01")
)

# Yearly mean sentiment as a line, one axis break per year.
ggplot(sen_byyear, aes(`year(review_date)`, sentiment)) +
  geom_line() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y")

顯示評論平均情緒分數在2006年時有下降趨勢,在最近的2018甚至是最低點
most common positive and negative words
# How often each Bing-lexicon word occurs, together with its sentiment label,
# sorted from most to least frequent.
bing_word_counts <- tidy_df %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining, by = "word"
# Print the counts, then chart the top ten words of each sentiment in
# side-by-side panels.
bing_word_counts

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = NULL, y = "Contribution to sentiment") +
  coord_flip()
Selecting by n

word cloud
# Word cloud of at most 100 words, sized by how often each word occurs.
tidy_df %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
Joining, by = "word"

# Comparison cloud of Bing sentiment words: cast the counts into a
# word-by-sentiment matrix (0 where a word lacks a sentiment) and plot it.
tidy_df %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
Joining, by = "word"

Ch3.Analyzing word and document frequency: tf-idf
# Term frequency per review, this time without removing stop words.
df_words <- df %>%
  unnest_tokens(word, review_body) %>%
  count(id, word, sort = TRUE) %>%
  ungroup()

# Total number of words in each review.
total_words <- df_words %>%
  group_by(id) %>%
  summarise(total = sum(n))

# Attach each review's total to its per-word counts.
book_words <- left_join(df_words, total_words)
Joining, by = "id"
# Print the per-review word counts with the review totals attached.
book_words
n 是該單字在該則評論中出現的次數,total 是該則評論(以 id 區分)的總字數
Zipf’s law
# Rank every word inside its review by descending count and compute its term
# frequency (count divided by the review's total word count).
# Ranking on desc(n) explicitly, rather than on the current row position,
# keeps the ranks correct even if the rows are not already sorted by count.
# Ties keep their order of appearance. The grouping is dropped afterwards so
# it does not leak into later steps.
freq_by_rank <- book_words %>%
  group_by(id) %>%
  mutate(
    rank = row_number(desc(n)),
    `term frequency` = n / total
  ) %>%
  ungroup()
freq_by_rank
以一篇評論當作一篇文本來看,常見的stop words幾乎都是rank前幾名(相較其他字詞,出現頻率的rank)
# Keep ranks strictly between 10 and 500, then fit log10(term frequency)
# against log10(rank) on that subset.
rank_subset <- filter(freq_by_rank, rank > 10, rank < 500)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
Call:
lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
Coefficients:
(Intercept) log10(rank)
-0.8067 -0.8256
The bind_tf_idf function
# Add tf, idf and tf-idf columns, treating each review (id) as one document.
book_words <- bind_tf_idf(book_words, word, id, n)
book_words

# Highest tf-idf first (the total column is kept).
book_words %>%
  arrange(desc(tf_idf))

# Bar chart of the top 15 rows by tf-idf.
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(15) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
Selecting by tf_idf

Ch4.Relationships between words: n-grams and correlations
# Tokenise the reviews into bigrams (pairs of consecutive words).
df_bigrams <- unnest_tokens(df, bigram, review_body, token = "ngrams", n = 2)
df_bigrams

# Most frequent bigrams, stop words included.
count(df_bigrams, bigram, sort = TRUE)
# Split each bigram into its two words.
bigrams_separated <- separate(
  df_bigrams, bigram, c("word1", "word2"),
  sep = " "
)

# Drop bigrams in which either word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Bigram counts after stop-word filtering.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
bigram_counts

# Glue the two words back together into a single bigram column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
# Most frequent trigrams in which none of the three words is a stop word.
df %>%
  unnest_tokens(trigram, review_body, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
# tf-idf of each bigram, treating every review (id) as one document, sorted
# from highest to lowest.
bigram_tf_idf <- bigrams_united %>%
  count(id, bigram) %>%
  bind_tf_idf(term = bigram, document = id, n = n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
排在not後的字詞
# AFINN lexicon: one numeric sentiment score per word.
AFINN <- get_sentiments("afinn")
# Newer tidytext releases name the score column `value` instead of `score`.
# Normalise it to `score` so the code that follows runs with either version.
# This is a no-op when the column is already called `score`.
names(AFINN)[names(AFINN) == "value"] <- "score"

# Sentiment-bearing words that directly follow "not", with how often each
# occurs.
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
not_words

# The 20 words with the largest absolute contribution (count * score),
# i.e. the ones that mislead a word-by-word sentiment score the most.
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

不只not代表否定,加入其他否定字詞
# Negation words to look for in the first position of a bigram.
negation_words <- c("not", "no", "never", "without")

# Sentiment-bearing words that directly follow any of the negation words.
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

# The 20 largest absolute contributions, one panel per negation word.
negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(x = word2, y = contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, scales = "free_y") +
  labs(
    x = "Words preceded by \"negated words\"",
    y = "Sentiment score * number of occurrences"
  ) +
  coord_flip()

#save.image("tidytext_hawai.RData")
Visualizing a network of bigrams with ggraph
# Build a directed graph from the bigrams that occur more than 90 times:
# word1 -> word2, with the count n carried as an edge attribute.
frequent_bigrams <- filter(bigram_counts, n > 90)
bigram_graph <- graph_from_data_frame(frequent_bigrams)
rm(frequent_bigrams)
bigram_graph
IGRAPH 314b2cf DN-- 215 189 --
+ attr: name (v/c), n (e/n)
+ edges from 314b2cf (vertex names):
[1] rainbow ->tower hawaiian->village hilton ->hawaiian
[4] ocean ->view diamond ->head waikiki ->beach
[7] tapa ->tower ali'i ->tower front ->desk
[10] resort ->fee walking ->distance friday ->night
[13] abc ->store ala ->moana kalia ->tower
[16] hilton ->honors ocean ->front head ->tower
[19] highly ->recommend abc ->stores super ->pool
[22] minute ->walk alii ->tower tropics ->bar
+ ... omitted several edges
library(ggraph)

# Fix the seed before drawing the "fr" layout.
set.seed(2017)

# Digits are stripped from the review text during preprocessing so that
# numbers do not show up as nodes here.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1)

set.seed(2016)

# Closed arrow heads show the direction word1 -> word2.
a <- grid::arrow(length = unit(.15, "inches"), type = "closed")

# Same network with edge transparency mapped to the bigram count, arrows
# that stop short of the nodes, larger light-blue nodes, and no axes.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    end_cap = circle(.07, 'inches')
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

Counting and correlating among reviews
library(widyr)

# Count how many reviews each pair of words co-occurs in, most common first.
word_pairs <- pairwise_count(tidy_df, word, id, sort = TRUE)
word_pairs
看出在每一則評論中,最常一起出現的兩個字
也可以查看某一單字最常和誰一起出現
# Words that most often share a review with "pool".
filter(word_pairs, item1 == "pool")
pairwise correlation
# Pairwise correlation of words across reviews, restricted to words that
# occur at least 20 times in total.
word_cors <- tidy_df %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(item = word, feature = id, sort = TRUE)
word_cors
查看moana這個單字最常和誰一起出現
# Words most correlated with "moana".
filter(word_cors, item1 == "moana")
以長條圖排序 並排四個字和其他字的correlation
# For four chosen words, chart the six most correlated partner words, one
# panel per chosen word.
word_cors %>%
  filter(item1 %in% c("moana", "louis", "waikiki", "shopping")) %>%
  group_by(item1) %>%
  top_n(6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(x = item2, y = correlation)) +
  geom_col() +
  facet_wrap(~item1, scales = "free") +
  coord_flip()
Selecting by correlation

set.seed(2016)

# Network of word pairs whose correlation exceeds 0.45; edge transparency
# follows the correlation and labels are repelled to avoid overlap.
word_cors %>%
  filter(correlation > .45) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation),
    show.legend = FALSE
  ) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

