library(rvest)        # web scraping
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
library(tidytext)     # tokenization and sentiment lexicons
library(tokenizers)
library(stopwords)
library(reshape2)     # acast() for the comparison word cloud
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
library(lattice)
library(gridExtra)    # arranging multiple ggplots
library(janeaustenr)
library(igraph)       # bigram network graphs
library(ggraph)
library(qgraph)
library(networkD3)    # interactive force-directed network
url_base <- "https://www.amazon.com/All-new-Echo-Dot-3rd-Gen/product-reviews/B0792KTHKJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=%d"
if (!file.exists("review.RData")) {
  map_df(1:142, function(i) {
    # simple but effective progress indicator
    cat(".")
    pg <- read_html(sprintf(url_base, i))
    data.frame(
      review = html_text(html_nodes(pg, ".review-text")),
      stringsAsFactors = FALSE)
  }) -> review
  # Save for later use
  save(review, file = "review.RData")
} else {
  load(file = "review.RData")
}
# Construct a data frame using words appearing in reviews
review_t <- review %>%
  dplyr::select(review) %>%
  # strip newlines and leftover HTML tags before tokenizing
  # (str_replace_all so every occurrence is replaced, not just the first)
  mutate(review = str_replace_all(review, "\n", " ") %>%
           str_replace_all("<.*?>", " ")) %>%
  unnest_tokens(word, review)
# cleaning text
word_remove <- c("just", "also", "overall", "one", "can", "gen", "generation", "get", "set")
review_t <- review_t %>%
  filter(!(word %in% stopwords::stopwords())) %>%   # standard English stop words
  filter(!(word %in% word_remove)) %>%              # domain-specific filler words
  filter(!(str_detect(word, "[0-9]")))              # remove numbers as well
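The bigram section later uses tidytext’s stop_words table instead of the stopwords package; for single tokens the equivalent filter is an anti_join (note that stop_words combines several lexicons, so it removes more words than stopwords::stopwords()):
# equivalent stop-word removal with tidytext's stop_words data frame
review_t %>% anti_join(stop_words, by = "word")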
top25_review <- review_t %>% count(word, sort = TRUE) %>% slice(1:25)
top25_review
## # A tibble: 25 x 2
## word n
## <chr> <int>
## 1 echo 575
## 2 dot 564
## 3 sound 498
## 4 alexa 443
## 5 music 388
## 6 love 361
## 7 great 352
## 8 like 289
## 9 speaker 287
## 10 use 260
## # ... with 15 more rows
A simple word cloud of the top 25 words:
pal <- brewer.pal(8, "Dark2")
wordcloud(words = top25_review$word, freq = top25_review$n, colors = pal)
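The word cloud layout is randomised, so each run looks slightly different; fixing the seed beforehand makes the figure reproducible:
set.seed(1234)  # wordcloud() places words randomly; a seed makes the layout repeatable
wordcloud(words = top25_review$word, freq = top25_review$n, colors = pal)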
echo_bing <- review_t %>%
  left_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment))
## Joining, by = "word"
echo_bing_counts <- review_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
echo_bing_counts$sentiment <- factor(echo_bing_counts$sentiment)
echo_bing_counts %>%
  select(sentiment, n) %>%
  group_by(sentiment) %>%
  summarise(word_count = sum(n))
## # A tibble: 2 x 2
## sentiment word_count
## <fct> <int>
## 1 negative 995
## 2 positive 4079
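That works out to roughly 80% positive among the sentiment-bearing words:
echo_bing_counts %>%
  group_by(sentiment) %>%
  summarise(total = sum(n)) %>%
  mutate(share = total / sum(total))  # 4079 / (4079 + 995), about 0.80 positive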
echo_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
## Selecting by n
Here I found one interesting thing. Although “loud” is categorised as negative, in this context it is actually a positive word.
For example, in the review “It’s louder with better bass response compared with the 2nd generation Dot that I also have.” the tone is clearly positive. Using a lexicon is convenient; however, we need to understand that interpretation is always contextual.
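A quick check confirms how the lexicon tags the word, and for a domain like speaker reviews a small override is a reasonable fix. The reclassification below is my own judgment for this domain, not part of the bing lexicon:
# see how bing classifies "loud"
get_sentiments("bing") %>% filter(word == "loud")
# domain-specific override: for speaker reviews, treat "loud" as positive
bing_custom <- get_sentiments("bing") %>%
  mutate(sentiment = if_else(word == "loud", "positive", sentiment))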
review_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("tomato", "darkturquoise"),
                   max.words = 100)
## Joining, by = "word"
The Google Home Mini is considered an alternative to the Echo Dot, and the two are similarly priced, so I thought it would be interesting to compare the customer reviews of both products.
The Google Home Mini review data was scraped from https://www.bestbuy.com/ and pre-processed using the same procedure as above.
#load data
load(file="mini_t.RData") # already tokenized
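For reference, mini_t was produced with the same pipeline as review_t; a sketch, assuming the raw Best Buy reviews sit in a data frame mini with a review column:
mini_t <- mini %>%
  dplyr::select(review) %>%
  mutate(review = str_replace_all(review, "\n", " ") %>%
           str_replace_all("<.*?>", " ")) %>%
  unnest_tokens(word, review) %>%
  filter(!(word %in% stopwords::stopwords())) %>%
  filter(!(word %in% word_remove)) %>%
  filter(!(str_detect(word, "[0-9]")))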
mini_bing <- mini_t %>%
  left_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment))
## Joining, by = "word"
mini_bing_counts <- mini_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
Create a word cloud for a quick view of the sentiment:
mini_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("tomato", "darkturquoise"),
                   max.words = 100)
## Joining, by = "word"
# prepare data for the graph: stack both products' sentiment words into one frame
proportion <- rbind(echo_bing %>% mutate(product = "echo"),
                    mini_bing %>% mutate(product = "mini"))
ggplot(proportion, aes(x = product, fill = sentiment)) +
  geom_bar(position = "fill", width = 0.5) +
  ylab("proportion") +
  geom_hline(yintercept = 0.785, linetype = "dashed",  # dashed reference line to ease comparison
             color = "black", size = 0.2)
The sentiment proportions of the two products are similar, with the Echo scoring slightly better than the Google Home Mini.
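To put a number on “slightly better”, the positive share per product can be computed from the same data:
proportion %>%
  count(product, sentiment) %>%
  group_by(product) %>%
  mutate(share = n / sum(n))  # positive/negative share per product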
p2 <- echo_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = NULL, x = NULL, title = "Amazon Echo Dot (3rd gen)") +
  coord_flip()
## Selecting by n
p3 <- mini_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL, title = "Google Home Mini") +
  coord_flip()
## Selecting by n
grid.arrange(p2, p3, ncol = 1)
The words in the positive reviews of the two products are quite similar. The negative reviews are more interesting: the Google Home Mini reviews contain words like “limitation” and “lack”. This could be a valuable reference for customers.
echo_bigrams <- review %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2)
echo_bigrams %>% count(bigram, sort = TRUE) %>% slice(1:50)
## # A tibble: 50 x 2
## bigram n
## <chr> <int>
## 1 echo dot 259
## 2 the sound 185
## 3 i have 181
## 4 in the 157
## 5 the echo 157
## 6 i love 150
## 7 of the 147
## 8 sound quality 138
## 9 with the 125
## 10 i am 124
## # ... with 40 more rows
# cleaning the bigrams
bigrams_separated <- echo_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 3,670 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 echo dot 259
## 2 sound quality 138
## 3 2nd gen 84
## 4 3rd gen 72
## 5 2nd generation 58
## 6 3rd generation 46
## 7 gen dot 40
## 8 echo dots 37
## 9 alexa app 33
## 10 gen 2 32
## # ... with 3,660 more rows
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
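bigrams_united recombines the filtered pairs into a single column, which is handy for counting or a later tf-idf analysis:
bigrams_united %>% count(bigram, sort = TRUE)  # same counts as bigram_counts, in one column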
echo_trigrams <- review %>%
  unnest_tokens(trigram, review, token = "ngrams", n = 3)
echo_trigrams %>% count(trigram, sort = TRUE) %>% slice(1:50)
## # A tibble: 50 x 2
## trigram n
## <chr> <int>
## 1 the echo dot 78
## 2 the sound quality 65
## 3 the sound is 58
## 4 i love it 48
## 5 sound quality is 48
## 6 to set up 48
## 7 the 2nd gen 38
## 8 the 2nd generation 36
## 9 a lot of 34
## 10 better than the 32
## # ... with 40 more rows
# cleaning the trigrams
trigrams_separated <- echo_trigrams %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word)
# new trigram counts:
trigram_counts <- trigrams_filtered %>%
  count(word1, word2, word3, sort = TRUE)
trigram_counts
## # A tibble: 1,529 x 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 dot 3rd gen 24
## 2 echo dot 3rd 16
## 3 gen echo dot 16
## 4 2nd gen dot 15
## 5 3rd gen dot 15
## 6 2nd generation echo 13
## 7 2nd gen echo 12
## 8 3rd generation dot 12
## 9 generation echo dot 11
## 10 dot 2nd gen 10
## # ... with 1,519 more rows
It seems that trigrams don’t really provide more information than bigrams in this dataset.
# sentiment-bearing words that are preceded by "not"
negative_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
negative_words
## # A tibble: 28 x 3
## word2 score n
## <chr> <int> <int>
## 1 like 2 7
## 2 happy 3 6
## 3 clear 1 4
## 4 great 3 4
## 5 impressed 3 4
## 6 want 1 4
## 7 better 2 3
## 8 recommend 2 3
## 9 worth 2 3
## 10 bad -3 2
## # ... with 18 more rows
negative_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
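The same idea extends beyond “not”; a sketch using the common negation set from Text Mining with R:
negation_words <- c("not", "no", "never", "without")
bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)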
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 15) %>%
  graph_from_data_frame()
bigram_graph
## IGRAPH 23c243e DN-- 32 28 --
## + attr: name (v/c), n (e/n)
## + edges from 23c243e (vertex names):
## [1] echo ->dot sound ->quality 2nd ->gen
## [4] 3rd ->gen 2nd ->generation 3rd ->generation
## [7] gen ->dot echo ->dots alexa ->app
## [10] gen ->2 dot ->3rd amazon ->music
## [13] google ->home play ->music generation->dot
## [16] gen ->echo bluetooth ->speaker playing ->music
## [19] smart ->home gen ->3 generation->echo
## [22] highly ->recommend customer ->service smart ->plugs
## + ... omitted several edges
set.seed(2019)  # the "fr" layout is stochastic; fixing the seed keeps the figure stable
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
g <- ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
g
# detect word communities with a random walk (walktrap) and colour nodes by group
wt <- cluster_walktrap(bigram_graph, steps = 6)
members <- membership(wt)
sj_list <- igraph_to_networkD3(bigram_graph, group = members)
# plot as a force-directed network
net <- forceNetwork(Links = sj_list$links, Nodes = sj_list$nodes, Source = 'source',
                    Target = 'target', NodeID = 'name', Group = 'group',
                    zoom = TRUE, linkDistance = 200)
net
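The result is an htmlwidget, so it can be saved as a standalone HTML page and shared outside the notebook (assuming the htmlwidgets package is installed; the file name here is arbitrary):
htmlwidgets::saveWidget(net, "bigram_network.html", selfcontained = TRUE)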