4.1 Tokenizing by n-gram
library(dplyr)
library(tidytext)
library(janeaustenr)
# Tokenize the `text` column of the first six columns of `lyric_data` into
# two-word n-grams, one bigram per row, then print the result.
lyric_bigrams <- unnest_tokens(
  lyric_data[, 1:6],
  bigram,
  text,
  token = "ngrams",
  n = 2
)
lyric_bigrams
4.1.1 Counting and filtering n-grams
# Tally every distinct bigram, most frequent first (printed, not stored).
count(lyric_bigrams, bigram, sort = TRUE)
library(tidyr)
package 'tidyr' was built under R version 3.4.4
# Split each bigram on the single space into its two component words.
bigrams_separated <- separate(
  lyric_bigrams,
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Keep only the pairs in which neither word appears in the stop-word list.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !(word1 %in% stop_words$word),
    !(word2 %in% stop_words$word)
  )

# Bigram counts after stop-word removal, most frequent pair first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
bigram_counts

# Glue the two words back together into a single `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
# Same procedure for three-word sequences: tokenize into trigrams, split them
# into their words, drop any trigram containing a stop word, and count what
# remains (printed, not stored).
lyric_data[, 1:6] %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word),
    !(word2 %in% stop_words$word),
    !(word3 %in% stop_words$word)
  ) %>%
  count(word1, word2, word3, sort = TRUE)
4.1.2 Analyzing bigrams
# Which words precede "street", broken down by album?
bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(album, word1, sort = TRUE)

# tf-idf of each bigram, treating every album as one document; the rows with
# the highest tf-idf come first.
bigram_tf_idf <- bigrams_united %>%
  count(album, bigram) %>%
  bind_tf_idf(bigram, album, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
4.1.3 Using bigrams to provide context in sentiment analysis
# How often does each word directly follow "not"?
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

# Load the AFINN sentiment lexicon and print it.
AFINN <- get_sentiments("afinn")
AFINN

# Attach an AFINN score to every word that follows "not" and count how often
# each scored word occurs in that position.
# NOTE(review): this relies on the lexicon exposing a `score` column; newer
# tidytext releases appear to name it `value` -- confirm against the
# installed version before re-running.
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c("word2" = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
not_words
library(ggplot2)
# Bar chart of the 20 words following "not" with the largest absolute
# contribution (score times number of occurrences). Bars are ordered by
# contribution and filled according to its sign.
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(x = word2, y = n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = "Words preceded by \"not\"",
    y = "Sentiment score * number of occurrences"
  ) +
  coord_flip()

# Negating words to look for in the first position of a bigram.
negation_words <- c("not", "no", "never", "without")

# Scored words that follow any of the negation words, counted per
# (negation word, following word, score) combination.
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c("word2" = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()
4.1.4 Visualizing a network of bigrams with ggraph
library(igraph)
Attaching package: 'igraph'
The following object is masked from 'package:tidyr':
crossing
The following objects are masked from 'package:dplyr':
as_data_frame, groups, union
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:base':
union
# Stop-word-free bigram counts computed earlier, before any frequency cut-off.
bigram_counts

# Keep only the pairs seen more than 20 times and build an igraph object from
# them; the remaining column `n` is carried along as an edge attribute.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 20))
bigram_graph
IGRAPH 8466b6a DN-- 131 126 --
+ attr: name (v/c), n (e/n)
+ edges from 8466b6a (vertex names):
[1] la ->la yeah ->yeah
[3] da ->da prince ->miscellaneous
[5] na ->na ha ->ha
[7] baby ->baby doo ->doo
[9] ooh ->ooh rock ->rock
[11] hey ->hey sexy ->er
[13] dance ->dance 4 ->love
[15] ooh ->baby uh ->uh
+ ... omitted several edges
library(ggraph)
# Seed the RNG before the "fr" layout is computed, presumably so the node
# placement is repeatable between runs.
set.seed(2017)

# Basic network plot: plain edges, point nodes, and each node labelled with
# its word.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1)

# Seed the RNG before the "fr" layout is computed, presumably so the node
# placement is repeatable between runs.
set.seed(2016)

# Closed arrow head used to show the direction of each edge.
a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))

# Polished network plot: edge transparency follows the bigram count `n`,
# arrows stop short of the target node, nodes are larger and light blue, and
# the axes and background are removed.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    arrow = a,
    end_cap = circle(0.07, "inches"),
    show.legend = FALSE
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

4.1.5 Visualizing bigrams in other texts
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)
#' Count stop-word-free bigrams in a text data set
#'
#' Tokenizes the `text` column of `dataset` into bigrams, splits each bigram
#' into its two words, discards every pair in which either word is in
#' `stop_words$word`, and tallies the remaining pairs.
#'
#' @param dataset A data frame with a character column named `text`.
#' @return A data frame with the columns `word1`, `word2` and `n`, sorted by
#'   descending `n`.
count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    # A text with fewer than two words can yield an NA bigram. `%in%` never
    # returns NA, so such rows would pass the stop-word filter below and show
    # up as an NA/NA pair in the counts; drop them before splitting.
    filter(!is.na(bigram)) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(
      !word1 %in% stop_words$word,
      !word2 %in% stop_words$word
    ) %>%
    count(word1, word2, sort = TRUE)
}
#' Draw a directed network of bigram counts
#'
#' Converts `bigrams` into a graph and plots it with ggraph using the "fr"
#' layout. Edge transparency is mapped to the count column `n`, and each node
#' is labelled with its name.
#'
#' @param bigrams A data frame accepted by `graph_from_data_frame()` that
#'   also carries a count column `n`, such as the output of `count_bigrams()`.
#' @return The plot object built by `ggraph()` and the added layers.
visualize_bigrams <- function(bigrams) {
  # Fixed seed set before the layout is computed, presumably so the node
  # placement is repeatable; note that this resets the session's RNG state.
  set.seed(2016)
  arrow_head <- grid::arrow(type = "closed", length = unit(0.15, "inches"))
  bigram_network <- graph_from_data_frame(bigrams)

  ggraph(bigram_network, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = arrow_head
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
    theme_void()
}
4.2.2 Pairwise correlation
# Restrict to words with at least 20 rows in `lyric_section_words`, then
# compute the pairwise correlation between words across sections, sorted by
# correlation.
word_cors <- lyric_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)
word_cors
# Every correlation in which the first word is "alfred".
filter(word_cors, item1 == "alfred")

# For four selected words, plot their six strongest partner words as
# horizontal bars, one facet per selected word.
word_cors %>%
  filter(item1 %in% c("alfred", "chelsea", "parker", "witness")) %>%
  group_by(item1) %>%
  # No weighting column is given, so top_n() selects by the last column,
  # `correlation`, and reports that choice in a message.
  top_n(6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(x = item2, y = correlation)) +
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
Selecting by correlation

# Seed the RNG before the "fr" layout is computed, presumably so the node
# placement is repeatable between runs.
set.seed(2016)

# Network of word pairs whose correlation exceeds 0.9: edge transparency
# follows the correlation, and node labels are repelled from one another.
word_cors %>%
  filter(correlation > 0.9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

