Hey @drob, can R analyze @beck lyrics to find patterns? I love it but i dont know wha any of this shit means.
— Cakey McTeaseface (@Cake_Tease) September 15, 2016
Extract and parse lyrics from some popular Beck songs:
library(rvest)
library(stringr)
library(dplyr)
library(purrr)
library(tidyr)
library(tidytext)
library(tokenizers)
# Scrape the song titles listed on Beck's MetroLyrics artist page.
# The text of every ".title" node is cleaned by removing the " Lyrics" label
# and any apostrophes, then trimming surrounding whitespace.
#
# str_replace_all() is used instead of str_replace(): str_replace() removes
# only the FIRST match of the pattern, so a title with an apostrophe ahead of
# the " Lyrics" label would lose the apostrophe but keep the label.
song_titles <- read_html("http://www.metrolyrics.com/beck-lyrics.html") %>%
  html_nodes(".title") %>%
  html_text() %>%
  str_replace_all("( Lyrics|\\')", "") %>%
  str_trim()
# Read a lyrics page and return the text of each of its ".verse" nodes.
#
# link: location of the page, passed straight to read_html().
# Returns the result of html_text() on the matched nodes (one element per
# ".verse" node found on the page).
extract_verses <- function(link) {
  page <- read_html(link)
  verse_nodes <- html_nodes(page, ".verse")
  html_text(verse_nodes)
}
# Build a table with one row per verse of each scraped song.
# Columns: title, text (the verse) and verse (its running number within the
# song). The result is left grouped by title.
beck_verses <- data_frame(title = unique(song_titles)) %>%
  # This title is excluded outright.
  filter(title != "Looser") %>%
  mutate(
    # URL slug: the title's word tokens joined with "-".
    converted = map_chr(tokenize_words(title), paste, collapse = "-"),
    link = paste0("http://www.metrolyrics.com/", converted, "-lyrics-beck.html")
  ) %>%
  # possibly() turns a failed page fetch into NULL instead of an error, so one
  # bad link does not abort the whole scrape.
  unnest(text = map(link, possibly(extract_verses, NULL))) %>%
  # Keep only title and text, repairing the misspelled "Mooon" on the way.
  transmute(title = str_replace(title, "Mooon", "Moon"), text) %>%
  group_by(title) %>%
  mutate(verse = row_number())
# Break the verses down to one word per row.
# Verses are first split into lines, the lines are numbered within each song,
# and each line is then split into words (the default tokenizer of
# unnest_tokens()). Grouping is removed before the result is stored.
beck_words <- beck_verses %>%
  unnest_tokens(text, text, token = "lines") %>%
  group_by(title) %>%
  mutate(line = seq_len(n())) %>%
  unnest_tokens(word, text) %>%
  ungroup()
# Show the resulting table.
beck_words
## # A tibble: 2,264 × 4
## title verse line word
## <chr> <int> <int> <chr>
## 1 Blue Moon 1 1 i'm
## 2 Blue Moon 1 1 so
## 3 Blue Moon 1 1 tired
## 4 Blue Moon 1 1 of
## 5 Blue Moon 1 1 being
## 6 Blue Moon 1 1 alone
## 7 Blue Moon 1 2 these
## 8 Blue Moon 1 2 penitent
## 9 Blue Moon 1 2 walls
## 10 Blue Moon 1 2 are
## # ... with 2,254 more rows
Count the most common words (excluding universally common words like “the” and “and”):
# Count how often each word occurs across all songs, most frequent first,
# after dropping every word that appears in the stop_words table.
# The join key is spelled out so the join does not depend on whichever column
# names the two tables happen to share, and so no "Joining, by" message is
# emitted.
beck_words %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 514 × 2
## word n
## <chr> <int>
## 1 microphone 20
## 2 turntables 18
## 3 wow 14
## 4 ramona 12
## 5 day 11
## 6 baby 10
## 7 hell 9
## 8 kill 9
## 9 loser 9
## 10 soy 9
## # ... with 504 more rows
A map of Beck's words: a network of the word pairs (digrams) that appear more than three times in the lyrics:
# Tokenize every verse into two-word sequences (digrams), then split each
# digram at the space into its first and second word.
beck_digrams <- separate(
  unnest_tokens(beck_verses, digram, text, token = "ngrams", n = 2),
  digram,
  into = c("word1", "word2"),
  sep = " "
)
# Show the resulting table.
beck_digrams
## # A tibble: 2,177 × 4
## title verse word1 word2
## * <chr> <int> <chr> <chr>
## 1 Blue Moon 1 i'm so
## 2 Blue Moon 1 so tired
## 3 Blue Moon 1 tired of
## 4 Blue Moon 1 of being
## 5 Blue Moon 1 being alone
## 6 Blue Moon 1 alone these
## 7 Blue Moon 1 these penitent
## 8 Blue Moon 1 penitent walls
## 9 Blue Moon 1 walls are
## 10 Blue Moon 1 are all
## # ... with 2,167 more rows
library(grid)
library(ggraph)
library(igraph)
# Fix the random seed so any randomness in the plot is repeatable.
# Note: the argument is ordinary arithmetic (2016 - 9 - 15), i.e. seed 1992.
set.seed(2016 - 09 - 15)
# Small closed arrowhead used to show the direction word1 -> word2.
a <- arrow(type = "closed", length = unit(.05, "inches"))
# Draw the digrams as a directed graph: count each (word1, word2) pair, drop
# pairs in which BOTH words are stop words, and keep pairs seen more than
# three times.
beck_digrams %>%
  count(word1, word2, sort = TRUE) %>%
  ungroup() %>%
  filter(
    !(word1 %in% tm::stopwords() & word2 %in% tm::stopwords()),
    n > 3
  ) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(arrow = a) +
  geom_node_point(color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
What are the happiest and saddest songs?
library(ggplot2)
# Keep only the AFINN rows of the sentiments table.
# NOTE(review): this relies on `sentiments` having `lexicon` and `score`
# columns - confirm against the installed tidytext version.
afinn <- filter(sentiments, lexicon == "AFINN")
# Average the AFINN score of the matched words in each song and draw the
# songs as horizontal bars ordered by that average; the fill colour
# distinguishes songs whose average is above zero from the rest.
beck_words %>%
  inner_join(afinn, by = "word") %>%
  group_by(title) %>%
  summarize(positivity = mean(score)) %>%
  mutate(title = reorder(title, positivity)) %>%
  ggplot(aes(x = title, y = positivity, fill = positivity > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(x = "", y = "Positivity") +
  theme_bw()