str(alexa)
## 'data.frame': 3150 obs. of 7 variables:
## $ rating : int 5 5 5 5 5 5 5 5 5 5 ...
## $ date : Factor w/ 77 levels "1-Jul-18","1-Jun-18",..: 17 17 17 17 17 20 20 20 20 20 ...
## $ variation : Factor w/ 16 levels "Black","Black Dot",..: 1 1 1 1 12 1 1 1 1 1 ...
## $ verified_reviews: Factor w/ 2300 levels " ",""NEVER BUY CERTIFIED AND REFURBISHED ECHO DOT " I bought a "Certified and Refurbished " Echo Do"| __truncated__,..: 1251 722 1583 1249 1193 1084 295 2214 461 90 ...
## $ feedback : int 1 1 1 1 1 1 1 1 1 1 ...
## $ week : num 20 20 20 20 20 20 20 20 20 20 ...
## $ day : int 1 1 1 1 1 2 2 2 2 2 ...
alexa<-alexa[which(alexa$feedback==1),]
alexa$date<-as.Date(alexa$date, format="%d-%b-%y")
alexa$feedback<-as.factor(alexa$feedback)
alexa$verified_reviews<-as.character(alexa$verified_reviews)
custom_stop_words <- bind_rows(data_frame(word = c("echo",
"alexa",
"dot",
"product",
"amazon",
"34",
"4.5",
"na",
"stick",
"fire",
"firestick"),
lexicon = c("custom")),
stop_words)
custom_stop_words
tidy_books <- alexa %>%
unnest_tokens(word, verified_reviews) %>%
anti_join(custom_stop_words)%>%
mutate(linenumber = day)
## Joining, by = "word"
head(tidy_books)
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 100) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
## Warning: package 'bindrcpp' was built under R version 3.5.1

jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(variation,index = linenumber %/% 1, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
ggplot(jane_austen_sentiment, aes(index, sentiment, fill=variation)) +
geom_col(show.legend = FALSE) +
facet_wrap(~variation, ncol = 4, scales = "free_y")

bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n

bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(variation) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(variation) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("variation")) %>%
mutate(ratio = negativewords/words) %>%
top_n(10) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
bingpositive <- get_sentiments("bing") %>%
filter(sentiment == "positive")
wordcounts <- tidy_books %>%
group_by(variation) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingpositive) %>%
group_by(variation) %>%
summarize(positivewords = n()) %>%
left_join(wordcounts, by = c("variation")) %>%
mutate(ratio = positivewords/words) %>%
top_n(10) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
book_words <- alexa %>%
unnest_tokens(word, verified_reviews) %>%
anti_join(custom_stop_words) %>%
count(variation, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
total_words <- book_words %>%
group_by(variation) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "variation"
book_words
ggplot(book_words, aes(n/total, fill = variation)) +
geom_histogram(show.legend = FALSE, binwidth = 0.001) +
facet_wrap(~variation, ncol = 4, scales = "fixed")

freq_by_rank <- book_words %>%
group_by(variation) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = variation)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()

book_words <- book_words %>%
bind_tf_idf(word, variation, n)
book_words
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(variation) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = variation)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~variation, ncol = 4, scales = "free") +
coord_flip()
## Selecting by tf_idf

alx<-tidy_books %>%
group_by(variation) %>%
count(word)
head(alx)
m <- alx %>%
cast_dtm(variation, word, n)
ap_lda <- LDA(m, k = 2, control = list(seed = 176))
ap_lda
## A LDA_VEM topic model with 2 topics.
ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
beta_spread %>%
arrange(desc(log_ratio)) %>%
head(20) %>%
mutate(word2 = reorder(term, log_ratio)) %>%
ggplot(aes(word2, log_ratio)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()

top<-beta_spread %>%
arrange(desc(log_ratio)) %>%
head(10)
top
bot<-beta_spread %>%
arrange(desc(log_ratio)) %>%
tail(10)
bot
t.diff<-rbind(top,bot)
t.diff
t.diff %>%
mutate(word2 = reorder(term, log_ratio)) %>%
ggplot(aes(word2, log_ratio)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()

alexa_bigrams <- alexa %>%
unnest_tokens(word, verified_reviews, token = "ngrams", n = 2)%>%
mutate(linenumber = day)
alexa_bigrams
alexa_bigrams<-na.omit(alexa_bigrams)
alexa_bigrams %>%
count(word, sort = TRUE)
bigrams_separated <- alexa_bigrams %>%
separate(word, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% custom_stop_words$word) %>%
filter(!word2 %in% custom_stop_words$word)
bigrams_filtered[1:10,]
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united[1:10,]
alexa %>%
unnest_tokens(trigram, verified_reviews, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE) %>%
na.omit()
bigrams_filtered %>%
filter(word2 == "music") %>%
count(variation, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(variation, bigram) %>%
bind_tf_idf(bigram, variation, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
book_words <- bigrams_united %>%
count(variation, bigram, sort = TRUE) %>%
ungroup()
total_words <- book_words %>%
group_by(variation) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "variation"
book_words
book_words <- book_words %>%
bind_tf_idf(bigram, variation, n)
book_words
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(variation) %>%
top_n(10) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = variation)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~variation, ncol = 4, scales = "free") +
coord_flip()
## Selecting by word

bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")
AFINN
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
not_words
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()

negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE) %>%
ungroup()
bigram_counts
bigram_graph <- bigram_counts %>%
filter(n > 5) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH 81cb978 DN-- 101 75 --
## + attr: name (v/c), n (e/n)
## + edges from 81cb978 (vertex names):
## [1] sound ->quality prime ->day play ->music
## [4] alarm ->clock smart ->home playing ->music
## [7] absolutely->love love ->love easy ->set
## [10] 2nd ->generation bluetooth ->speaker easy ->setup
## [13] google ->home highly ->recommend video ->chat
## [16] light ->bulbs blue ->tooth smart ->plugs
## [19] super ->easy user ->friendly smart ->devices
## [22] smart ->speaker speaker ->quality cell ->phone
## + ... omitted several edges
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(0.07, 'inches')) +
geom_node_point(color = "lightgreen", size = 4) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()

austen_section_words <- alexa %>%
mutate(section = day) %>%
filter(section > 0) %>%
unnest_tokens(word, verified_reviews) %>%
filter(!word %in% stop_words$word)
head(austen_section_words)
word_pairs <- austen_section_words %>%
pairwise_count(word, section, sort = TRUE)
word_pairs
word_pairs %>%
filter(item1 == "love")
word_cors <- austen_section_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, section, sort = TRUE)
word_cors
word_cors %>%
filter(item1 == "love")
word_cors %>%
filter(item1 %in% c("music", "love", "watch", "tv")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
## Selecting by correlation

word_cors %>%
filter(correlation > .7) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
