str(alexa)
## 'data.frame':    3150 obs. of  7 variables:
##  $ rating          : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ date            : Factor w/ 77 levels "1-Jul-18","1-Jun-18",..: 17 17 17 17 17 20 20 20 20 20 ...
##  $ variation       : Factor w/ 16 levels "Black","Black  Dot",..: 1 1 1 1 12 1 1 1 1 1 ...
##  $ verified_reviews: Factor w/ 2300 levels " ",""NEVER BUY CERTIFIED AND REFURBISHED ECHO DOT " I bought a "Certified and Refurbished " Echo Do"| __truncated__,..: 1251 722 1583 1249 1193 1084 295 2214 461 90 ...
##  $ feedback        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ week            : num  20 20 20 20 20 20 20 20 20 20 ...
##  $ day             : int  1 1 1 1 1 2 2 2 2 2 ...
alexa<-alexa[which(alexa$feedback==1),]

alexa$date<-as.Date(alexa$date, format="%d-%b-%y")
alexa$feedback<-as.factor(alexa$feedback)
alexa$verified_reviews<-as.character(alexa$verified_reviews)

custom_stop_words <- bind_rows(data_frame(word = c("echo", 
                                                   "alexa",
                                                   "dot",
                                                   "product",
                                                   "amazon",
                                                   "34",
                                                   "4.5",
                                                   "na",
                                                   "stick",
                                                   "fire",
                                                   "firestick"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words
tidy_books <- alexa %>%
  unnest_tokens(word, verified_reviews) %>%
  anti_join(custom_stop_words)%>%
  mutate(linenumber = day) 
## Joining, by = "word"
head(tidy_books)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
## Warning: package 'bindrcpp' was built under R version 3.5.1

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(variation,index = linenumber %/% 1, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
ggplot(jane_austen_sentiment, aes(index, sentiment, fill=variation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~variation, ncol = 4, scales = "free_y")

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(variation) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(variation) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("variation")) %>%
  mutate(ratio = negativewords/words) %>%
  top_n(10) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
bingpositive <- get_sentiments("bing") %>% 
  filter(sentiment == "positive")

wordcounts <- tidy_books %>%
  group_by(variation) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingpositive) %>%
  group_by(variation) %>%
  summarize(positivewords = n()) %>%
  left_join(wordcounts, by = c("variation")) %>%
  mutate(ratio = positivewords/words) %>%
  top_n(10) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
book_words <- alexa %>%
  unnest_tokens(word, verified_reviews) %>%
  anti_join(custom_stop_words) %>%
  count(variation, word, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
total_words <- book_words %>% 
  group_by(variation) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "variation"
book_words
ggplot(book_words, aes(n/total, fill = variation)) +
  geom_histogram(show.legend = FALSE, binwidth = 0.001) +
  facet_wrap(~variation, ncol = 4, scales = "fixed")

freq_by_rank <- book_words %>% 
  group_by(variation) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = variation)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

book_words <- book_words %>%
  bind_tf_idf(word, variation, n)

book_words
book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(variation) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = variation)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~variation, ncol = 4, scales = "free") +
  coord_flip()
## Selecting by tf_idf

alx<-tidy_books %>%
  group_by(variation) %>%
  count(word)
head(alx)
m <- alx %>%
  cast_dtm(variation, word, n)

ap_lda <- LDA(m, k = 2, control = list(seed = 176))
ap_lda
## A LDA_VEM topic model with 2 topics.
ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1)) 
beta_spread
beta_spread %>%
  arrange(desc(log_ratio)) %>%
  head(20) %>%
  mutate(word2 = reorder(term, log_ratio)) %>%
  ggplot(aes(word2, log_ratio)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

top<-beta_spread %>%
  arrange(desc(log_ratio)) %>%
  head(10) 
top
bot<-beta_spread %>%
  arrange(desc(log_ratio)) %>%
  tail(10) 
bot
t.diff<-rbind(top,bot)
t.diff
t.diff %>%
  mutate(word2 = reorder(term, log_ratio)) %>%
  ggplot(aes(word2, log_ratio)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

alexa_bigrams <- alexa %>%
  unnest_tokens(word, verified_reviews, token = "ngrams", n = 2)%>%
  mutate(linenumber = day) 

alexa_bigrams
alexa_bigrams<-na.omit(alexa_bigrams)
alexa_bigrams %>%
  count(word, sort = TRUE)
bigrams_separated <- alexa_bigrams %>%
  separate(word, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% custom_stop_words$word) %>%
  filter(!word2 %in% custom_stop_words$word)
bigrams_filtered[1:10,]
# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united[1:10,]
alexa %>%
  unnest_tokens(trigram, verified_reviews, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  na.omit()
bigrams_filtered %>%
  filter(word2 == "music") %>%
  count(variation, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
  count(variation, bigram) %>%
  bind_tf_idf(bigram, variation, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
book_words <- bigrams_united %>%
  count(variation, bigram, sort = TRUE) %>%
  ungroup()

total_words <- book_words %>% 
  group_by(variation) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "variation"
book_words
book_words <- book_words %>%
  bind_tf_idf(bigram, variation, n)
book_words
book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(variation) %>% 
  top_n(10) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = variation)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~variation, ncol = 4, scales = "free") +
  coord_flip()
## Selecting by word

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")

AFINN
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
not_words
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

bigram_counts
bigram_graph <- bigram_counts %>%
  filter(n > 5) %>%
  graph_from_data_frame()

bigram_graph
## IGRAPH 81cb978 DN-- 101 75 -- 
## + attr: name (v/c), n (e/n)
## + edges from 81cb978 (vertex names):
##  [1] sound     ->quality    prime     ->day        play      ->music     
##  [4] alarm     ->clock      smart     ->home       playing   ->music     
##  [7] absolutely->love       love      ->love       easy      ->set       
## [10] 2nd       ->generation bluetooth ->speaker    easy      ->setup     
## [13] google    ->home       highly    ->recommend  video     ->chat      
## [16] light     ->bulbs      blue      ->tooth      smart     ->plugs     
## [19] super     ->easy       user      ->friendly   smart     ->devices   
## [22] smart     ->speaker    speaker   ->quality    cell      ->phone     
## + ... omitted several edges
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2016)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(0.07, 'inches')) +
  geom_node_point(color = "lightgreen", size = 4) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

austen_section_words <- alexa %>%
  mutate(section = day) %>%
  filter(section > 0) %>%
  unnest_tokens(word, verified_reviews) %>%
  filter(!word %in% stop_words$word)

head(austen_section_words)
word_pairs <- austen_section_words %>%
  pairwise_count(word, section, sort = TRUE)

word_pairs
word_pairs %>%
  filter(item1 == "love")
word_cors <- austen_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

word_cors
word_cors %>%
  filter(item1 == "love")
word_cors %>%
  filter(item1 %in% c("music", "love", "watch", "tv")) %>%
  group_by(item1) %>%
  top_n(6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
## Selecting by correlation

word_cors %>%
  filter(correlation > .7) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()