Hey @drob, can R analyze @beck lyrics to find patterns? I love it but i dont know wha any of this shit means.

— Cakey McTeaseface (@Cake_Tease) September 15, 2016

Extract and parse lyrics from some popular Beck songs:

library(rvest)
library(stringr)
library(dplyr)
library(purrr)
library(tidyr)
library(tidytext)
library(tokenizers)

# Scrape the text of every `.title` node on the artist's index page and clean
# it up: remove the " Lyrics" marker and any apostrophes, then trim whitespace.
# str_replace_all() is needed because str_replace() only removes the FIRST
# match per string, so a title with an apostrophe (e.g. "Where It's At Lyrics")
# would lose the apostrophe but keep its " Lyrics" suffix.
song_titles <- read_html("http://www.metrolyrics.com/beck-lyrics.html") %>%
  html_nodes(".title") %>%
  html_text() %>%
  str_replace_all("( Lyrics|\\')", "") %>%
  str_trim()

# Fetch one lyrics page and return the text of its verses.
#
# link: location of the HTML page, handed straight to read_html().
# Returns a character vector with one element per `.verse` node found.
extract_verses <- function(link) {
  page <- read_html(link)
  verse_nodes <- html_nodes(page, ".verse")
  html_text(verse_nodes)
}

# One row per verse for every song whose lyrics page can be fetched.
#   * tibble() replaces the deprecated data_frame().
#   * The title "Looser" is skipped (presumably a misspelled duplicate entry on
#     the index page -- confirm against the site).
#   * Each title is tokenized into lower-case words joined by "-" to form the
#     slug used in the song's URL.
#   * possibly(extract_verses, NULL) turns a failed download into NULL, and
#     unnest() drops those empty entries, so unreachable songs silently vanish.
#   * The list column is created with mutate() and then unnested by name; the
#     `unnest(name = expr)` shorthand is not supported by current tidyr.
#   * "Mooon" is corrected to "Moon" only after the link is built, because the
#     URL slug is derived from the title as scraped.
#   * The final ungroup() keeps the `title` grouping (needed only to number the
#     verses within each song) from leaking into later steps.
beck_verses <- tibble(title = unique(song_titles)) %>%
  filter(title != "Looser") %>%
  mutate(converted = map_chr(tokenize_words(title), paste, collapse = "-")) %>%
  mutate(link = paste0("http://www.metrolyrics.com/", converted, "-lyrics-beck.html")) %>%
  mutate(text = map(link, possibly(extract_verses, NULL))) %>%
  unnest(text) %>%
  select(title, text) %>%
  mutate(title = str_replace(title, "Mooon", "Moon")) %>%
  group_by(title) %>%
  mutate(verse = row_number()) %>%
  ungroup()

# One row per word, keeping the song title, verse number and line number.
# Steps: split each verse into lines, number the lines within each song, then
# split each line into words.
# The data is explicitly ungrouped before every unnest_tokens() call: newer
# tidytext releases treat grouping variables as "collapse" variables, which
# would merge all text of a song into one string and drop the `verse` and
# `line` columns. Grouping is only needed for the row_number() step.
beck_words <- beck_verses %>%
  ungroup() %>%
  unnest_tokens(text, text, token = "lines") %>%
  group_by(title) %>%
  mutate(line = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text, token = "words")

beck_words
## # A tibble: 2,264 × 4
##        title verse  line     word
##        <chr> <int> <int>    <chr>
## 1  Blue Moon     1     1      i'm
## 2  Blue Moon     1     1       so
## 3  Blue Moon     1     1    tired
## 4  Blue Moon     1     1       of
## 5  Blue Moon     1     1    being
## 6  Blue Moon     1     1    alone
## 7  Blue Moon     1     2    these
## 8  Blue Moon     1     2 penitent
## 9  Blue Moon     1     2    walls
## 10 Blue Moon     1     2      are
## # ... with 2,254 more rows

Count most common words (not including universally common words like “the” and “and”):

# Drop stop words, then count how often each remaining word occurs, most
# frequent first. The join key is spelled out so the join does not depend on
# whichever column names the two tables happen to share (and so it stays
# quiet), matching the explicit `by = "word"` used for the sentiment join.
beck_words %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 514 × 2
##          word     n
##         <chr> <int>
## 1  microphone    20
## 2  turntables    18
## 3         wow    14
## 4      ramona    12
## 5         day    11
## 6        baby    10
## 7        hell     9
## 8        kill     9
## 9       loser     9
## 10        soy     9
## # ... with 504 more rows

A network map of the word pairs (bigrams) that appear more than three times in Beck's lyrics:

# One row per pair of consecutive words (bigram) within a verse, split into
# `word1` and `word2`.
# ungroup() guards against `beck_verses` arriving grouped by title: newer
# tidytext releases collapse text across rows within grouping variables before
# building n-grams, which would create bigrams spanning verse boundaries and
# drop the `verse` column.
beck_digrams <- beck_verses %>%
  ungroup() %>%
  unnest_tokens(digram, text, token = "ngrams", n = 2) %>%
  separate(digram, c("word1", "word2"), sep = " ")

beck_digrams
## # A tibble: 2,177 × 4
##        title verse    word1    word2
## *      <chr> <int>    <chr>    <chr>
## 1  Blue Moon     1      i'm       so
## 2  Blue Moon     1       so    tired
## 3  Blue Moon     1    tired       of
## 4  Blue Moon     1       of    being
## 5  Blue Moon     1    being    alone
## 6  Blue Moon     1    alone    these
## 7  Blue Moon     1    these penitent
## 8  Blue Moon     1 penitent    walls
## 9  Blue Moon     1    walls      are
## 10 Blue Moon     1      are      all
## # ... with 2,167 more rows
library(grid)
library(ggraph)
library(igraph)

# Fix the RNG state before the "fr" layout is computed.
# The seed was originally spelled `2016 - 09 - 15`; R evaluates that as the
# arithmetic expression 2016 - 9 - 15, i.e. 1992, so the same value is written
# out literally here.
set.seed(1992)

# Small closed arrowhead used for every edge.
a <- arrow(type = "closed", length = unit(.05, "inches"))

# Count each bigram, keep those seen more than three times in which at least
# one of the two words is not a tm stop word, and draw them as a directed graph.
beck_digrams %>%
  count(word1, word2, sort = TRUE) %>%
  ungroup() %>%
  filter(
    !(word1 %in% tm::stopwords() & word2 %in% tm::stopwords()),
    n > 3
  ) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(arrow = a) +
  geom_node_point(color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

What are the happiest and saddest songs?

library(ggplot2)

# Keep only the rows of tidytext's `sentiments` table whose lexicon is "AFINN".
afinn <- filter(sentiments, lexicon == "AFINN")

# Mean AFINN score per song, drawn as horizontal bars ordered by that mean.
# Words without an AFINN entry are dropped by the inner join; the fill colour
# separates songs with a positive mean from the rest.
beck_words %>%
  inner_join(afinn, by = "word") %>%
  group_by(title) %>%
  summarise(positivity = mean(score)) %>%
  ggplot(aes(reorder(title, positivity), positivity, fill = positivity > 0)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(x = "", y = "Positivity") +
  theme_bw()