library(rvest)        # web scraping
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
library(tidytext)     # tokenization and sentiment lexicons
library(tokenizers)
library(stopwords)
library(reshape2)     # acast() for the comparison word cloud
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
library(lattice)
library(gridExtra)    # arranging multiple ggplots
library(janeaustenr)
library(igraph)       # bigram network graphs
library(ggraph)
library(qgraph)
library(networkD3)    # interactive force-directed network
url_base <- "https://www.amazon.com/All-new-Echo-Dot-3rd-Gen/product-reviews/B0792KTHKJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber=%d"
if (!file.exists("review.RData")) {
  map_df(1:142, function(i) {
    # simple but effective progress indicator
    cat(".")
    pg <- read_html(sprintf(url_base, i))
    data.frame(
      review = html_text(html_nodes(pg, ".review-text")),
      stringsAsFactors = FALSE)
  }) -> review
  # Save for later use
  save(review, file = "review.RData")
} else {
  load(file = "review.RData")
}
# Construct a data frame using words appearing in reviews
review_t <- review %>%
  dplyr::select(review) %>%
  # strip newlines and leftover HTML tags before tokenizing
  # (str_replace_all so every occurrence is replaced, not just the first)
  mutate(review = str_replace_all(review, "\n", " ") %>%
           str_replace_all("<.*?>", " ")) %>%
  unnest_tokens(word, review)
# cleaning text
word_remove <- c("just", "also", "overall", "one", "can", "gen", "generation", "get", "set")
review_t <- review_t %>%
  filter(!(word %in% stopwords::stopwords())) %>%   # standard English stop words
  filter(!(word %in% word_remove)) %>%              # domain-specific filler words
  filter(!(str_detect(word, "[0-9]")))              # remove numbers as well
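The bigram section later uses tidytext’s stop_words table instead of the stopwords package; for single tokens the equivalent filter is an anti_join (note that stop_words combines several lexicons, so it removes more words than stopwords::stopwords()):
# equivalent stop-word removal with tidytext's stop_words data frame
review_t %>% anti_join(stop_words, by = "word")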
top25_review <- review_t %>% count(word, sort = TRUE) %>% slice(1:25)
top25_review
## # A tibble: 25 x 2
## word n
## <chr> <int>
## 1 echo 575
## 2 dot 564
## 3 sound 498
## 4 alexa 443
## 5 music 388
## 6 love 361
## 7 great 352
## 8 like 289
## 9 speaker 287
## 10 use 260
## # ... with 15 more rows
A simple word cloud of the top 25 words:
pal <- brewer.pal(8, "Dark2")
wordcloud(words = top25_review$word, freq = top25_review$n, colors = pal)
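The word cloud layout is randomised, so each run looks slightly different; fixing the seed beforehand makes the figure reproducible:
set.seed(1234)  # wordcloud() places words randomly; a seed makes the layout repeatable
wordcloud(words = top25_review$word, freq = top25_review$n, colors = pal)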
echo_bing <- review_t %>%
  left_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment))
## Joining, by = "word"
echo_bing_counts <- review_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
echo_bing_counts$sentiment <- factor(echo_bing_counts$sentiment)
echo_bing_counts %>%
  select(sentiment, n) %>%
  group_by(sentiment) %>%
  summarise(word_count = sum(n))
## # A tibble: 2 x 2
## sentiment word_count
## <fct> <int>
## 1 negative 995
## 2 positive 4079
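That works out to roughly 80% positive among the sentiment-bearing words:
echo_bing_counts %>%
  group_by(sentiment) %>%
  summarise(total = sum(n)) %>%
  mutate(share = total / sum(total))  # 4079 / (4079 + 995), about 0.80 positive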
echo_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
## Selecting by n
Here I found one interesting thing. Although “loud” is categorised as negative, in this context it is actually a positive word.
For example, in the review “It’s louder with better bass response compared with the 2nd generation Dot that I also have.” the tone is clearly positive. Using a lexicon is convenient; however, we need to understand that interpretation is always contextual.
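A quick check confirms how the lexicon tags the word, and for a domain like speaker reviews a small override is a reasonable fix. The reclassification below is my own judgment for this domain, not part of the bing lexicon:
# see how bing classifies "loud"
get_sentiments("bing") %>% filter(word == "loud")
# domain-specific override: for speaker reviews, treat "loud" as positive
bing_custom <- get_sentiments("bing") %>%
  mutate(sentiment = if_else(word == "loud", "positive", sentiment))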
review_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("tomato", "darkturquoise"),
                   max.words = 100)
## Joining, by = "word"
The Google Home Mini is considered an alternative to the Echo Dot, and the two are similarly priced, so I thought it would be interesting to compare the customer reviews of both products.
The Google Home Mini review data was scraped from https://www.bestbuy.com/ and pre-processed using the same procedure as above.
#load data
load(file="mini_t.RData") # already tokenized
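For reference, mini_t was produced with the same pipeline as review_t; a sketch, assuming the raw Best Buy reviews sit in a data frame mini with a review column:
mini_t <- mini %>%
  dplyr::select(review) %>%
  mutate(review = str_replace_all(review, "\n", " ") %>%
           str_replace_all("<.*?>", " ")) %>%
  unnest_tokens(word, review) %>%
  filter(!(word %in% stopwords::stopwords())) %>%
  filter(!(word %in% word_remove)) %>%
  filter(!(str_detect(word, "[0-9]")))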
mini_bing <- mini_t %>%
  left_join(get_sentiments("bing")) %>%
  filter(!is.na(sentiment))
## Joining, by = "word"
mini_bing_counts <- mini_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
Create a word cloud for a quick view of the sentiment:
mini_t %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("tomato", "darkturquoise"),
                   max.words = 100)
## Joining, by = "word"
# prepare data for the graph: stack both products' sentiment words into one frame
proportion <- rbind(echo_bing %>% mutate(product = "echo"),
                    mini_bing %>% mutate(product = "mini"))
ggplot(proportion, aes(x = product, fill = sentiment)) +
  geom_bar(position = "fill", width = 0.5) +
  ylab("proportion") +
  geom_hline(yintercept = 0.785, linetype = "dashed",  # dashed reference line to ease comparison
             color = "black", size = 0.2)
The sentiment proportions of the two products are similar, with the Echo scoring slightly better than the Google Home Mini.
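To put a number on “slightly better”, the positive share per product can be computed from the same data:
proportion %>%
  count(product, sentiment) %>%
  group_by(product) %>%
  mutate(share = n / sum(n))  # positive/negative share per product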
p2 <- echo_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = NULL, x = NULL, title = "Amazon Echo Dot (3rd gen)") +
  coord_flip()
## Selecting by n
p3 <- mini_bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(reorder(word, n), n, fill = sentiment)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL, title = "Google Home Mini") +
  coord_flip()
## Selecting by n
grid.arrange(p2, p3, ncol = 1)
The words in the positive reviews of the two products are quite similar. The negative reviews are more interesting: the Google Home Mini reviews contain words like “limitation” and “lack”. This could be a valuable reference for customers.
echo_bigrams <- review %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2)
echo_bigrams %>% count(bigram, sort = TRUE) %>% slice(1:50)
## # A tibble: 50 x 2
## bigram n
## <chr> <int>
## 1 echo dot 259
## 2 the sound 185
## 3 i have 181
## 4 in the 157
## 5 the echo 157
## 6 i love 150
## 7 of the 147
## 8 sound quality 138
## 9 with the 125
## 10 i am 124
## # ... with 40 more rows
# cleaning the bigrams
bigrams_separated <- echo_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 3,670 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 echo dot 259
## 2 sound quality 138
## 3 2nd gen 84
## 4 3rd gen 72
## 5 2nd generation 58
## 6 3rd generation 46
## 7 gen dot 40
## 8 echo dots 37
## 9 alexa app 33
## 10 gen 2 32
## # ... with 3,660 more rows
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
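bigrams_united recombines the filtered pairs into a single column, which is handy for counting or a later tf-idf analysis:
bigrams_united %>% count(bigram, sort = TRUE)  # same counts as bigram_counts, in one column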
echo_trigrams <- review %>%
  unnest_tokens(trigram, review, token = "ngrams", n = 3)
echo_trigrams %>% count(trigram, sort = TRUE) %>% slice(1:50)
## # A tibble: 50 x 2
## trigram n
## <chr> <int>
## 1 the echo dot 78
## 2 the sound quality 65
## 3 the sound is 58
## 4 i love it 48
## 5 sound quality is 48
## 6 to set up 48
## 7 the 2nd gen 38
## 8 the 2nd generation 36
## 9 a lot of 34
## 10 better than the 32
## # ... with 40 more rows
# cleaning the trigrams
trigrams_separated <- echo_trigrams %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word)
# new trigram counts:
trigram_counts <- trigrams_filtered %>%
  count(word1, word2, word3, sort = TRUE)
trigram_counts
## # A tibble: 1,529 x 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 dot 3rd gen 24
## 2 echo dot 3rd 16
## 3 gen echo dot 16
## 4 2nd gen dot 15
## 5 3rd gen dot 15
## 6 2nd generation echo 13
## 7 2nd gen echo 12
## 8 3rd generation dot 12
## 9 generation echo dot 11
## 10 dot 2nd gen 10
## # ... with 1,519 more rows
It seems that trigrams don’t really provide more information than bigrams in this dataset.
# sentiment-bearing words that are preceded by "not"
negative_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
negative_words
## # A tibble: 28 x 3
## word2 score n
## <chr> <int> <int>
## 1 like 2 7
## 2 happy 3 6
## 3 clear 1 4
## 4 great 3 4
## 5 impressed 3 4
## 6 want 1 4
## 7 better 2 3
## 8 recommend 2 3
## 9 worth 2 3
## 10 bad -3 2
## # ... with 18 more rows
negative_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
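The same idea extends beyond “not”; a sketch using the common negation set from Text Mining with R:
negation_words <- c("not", "no", "never", "without")
bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)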
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 15) %>%
  graph_from_data_frame()
bigram_graph
## IGRAPH 23c243e DN-- 32 28 --
## + attr: name (v/c), n (e/n)
## + edges from 23c243e (vertex names):
## [1] echo ->dot sound ->quality 2nd ->gen
## [4] 3rd ->gen 2nd ->generation 3rd ->generation
## [7] gen ->dot echo ->dots alexa ->app
## [10] gen ->2 dot ->3rd amazon ->music
## [13] google ->home play ->music generation->dot
## [16] gen ->echo bluetooth ->speaker playing ->music
## [19] smart ->home gen ->3 generation->echo
## [22] highly ->recommend customer ->service smart ->plugs
## + ... omitted several edges
set.seed(2019)  # the "fr" layout is stochastic; fixing the seed keeps the figure stable
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
g <- ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
g
# detect word communities with a random walk (walktrap) and colour nodes by group
wt <- cluster_walktrap(bigram_graph, steps = 6)
members <- membership(wt)
sj_list <- igraph_to_networkD3(bigram_graph, group = members)
# plot as a force-directed network
net <- forceNetwork(Links = sj_list$links, Nodes = sj_list$nodes, Source = 'source',
                    Target = 'target', NodeID = 'name', Group = 'group',
                    zoom = TRUE, linkDistance = 200)
net
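The result is an htmlwidget, so it can be saved as a standalone HTML page and shared outside the notebook (assuming the htmlwidgets package is installed; the file name here is arbitrary):
htmlwidgets::saveWidget(net, "bigram_network.html", selfcontained = TRUE)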