library(geniusr)                         # This package gets lyrics
library(tidyverse)
library(tidytext)
library(wordcloud2)

genius_token()
[1] "a4tjUC6-o2Q2woojDmmId0UtiQNH6kLYA-S6yNW2SHNIcyxLZWkxvMzXp5-3yJxY"

Assigment:

Pick an album of your choice - I recommend one with lots of lyrics. Rap albums are great for this.

  1. Find the album and get the lyrics, and unnest them.
  2. Clean the lyrics by removing stopwords, and then create a table and word cloud with the words counts.
  3. Do sentiment analyses using bing and nrc, and create graphs of the words that contribute most to each sentiment.
  4. Create bigrams of the lyrics, remove the stopwords, and create a table and word cloud of the most common bigrams.
  5. Use the bigram method to find the most common words that come after words of your choice, like i/you or he/she.

Step 1) Find the album and get the lyrics, and unnest them.

search_song("mansion")
get_song_meta(725820)
mansion_tracks <- scrape_tracklist(119880)
argument is not an atomic vector; coercing
mansion_tracks
mansion_lyrics <- map_df(mansion_tracks$song_lyrics_url, scrape_lyrics_url)
mansion_lyrics
NA
mansion_words <- mansion_lyrics %>%
  unnest_tokens(word, line) %>% 
  select(song_name, word)

mansion_words

Step 2) Clean the lyrics by removing stopwords, and then create a table and word cloud with the words counts.

mansion_words %>% 
  anti_join(get_stopwords()) %>% 
  count(word, sort = T)
Joining, by = "word"
mansion_words %>% 
  anti_join(get_stopwords()) %>% 
  count(word, sort = T) %>% 
  top_n(200) %>% 
  wordcloud2(size = .5)
Joining, by = "word"
Selecting by n

Step 3. Do sentiment analyses using bing and nrc, and create graphs of the words that contribute most to each sentiment.

bing <- get_sentiments("bing")

mansion_words %>% 
  inner_join(bing) %>% 
  count(word, sentiment, sort = TRUE)
Joining, by = "word"
mansion_words %>% 
  inner_join(bing) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(y = "NF mansion album: Words that contribute the most to each sentiment",
       x = NULL) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()
Joining, by = "word"
Selecting by n

nrc <- get_sentiments("nrc")
mansion_words %>% 
  inner_join(nrc) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  group_by(sentiment) %>%
  top_n(3) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(y = "NF's mansion album: Words that contribute the most to each sentiment",
       x = NULL) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()
Joining, by = "word"
Selecting by n

Step 4. 4. Create bigrams of the lyrics, remove the stopwords, and create a table and word cloud of the most common bigrams.

mansion_lyrics %>%
  unnest_tokens(bigram, line, token = "ngrams", n = 2) %>% 
  select(bigram) -> mansion_bigrams
mansion_bigrams %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  unite(bigram, word1, word2, sep = " ") %>% 
  count(bigram, sort = T)
NA
mansion_bigrams %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  unite(bigram, word1, word2, sep = " ") %>% 
  count(bigram, sort = T) %>% 
  filter(n > 1) %>% 
  wordcloud2(size = 1)

Step 5. Use the bigram method to find the most common words that come after words of your choice, like i/you or he/she.

first_word <- c("i", "you")                                  # these need to be lowercase

mansion_bigrams %>% 
  count(bigram, sort = T) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>%       # separate the two words
  filter(word1 %in% first_word) %>%                          # find first words from our list
  count(word1, word2, wt = n, sort = TRUE) %>% 
  rename(total = nn) %>% 


  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%     # put the words in order
  group_by(word1) %>% 
  top_n(3) %>% 
  ggplot(aes(word2, total, fill = word1)) +                          #
  scale_fill_viridis_d() +                                           # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  theme_minimal()
Selecting by total

LS0tCnRpdGxlOiAiaW5kZXBlbmRlbnQgbHlyaWMgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KGdlbml1c3IpICAgICAgICAgICAgICAgICAgICAgICAgICMgVGhpcyBwYWNrYWdlIGdldHMgbHlyaWNzCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRpZHl0ZXh0KQpsaWJyYXJ5KHdvcmRjbG91ZDIpCmBgYAoKYGBge3J9CgpnZW5pdXNfdG9rZW4oKQpgYGAKQXNzaWdtZW50OgoKUGljayBhbiBhbGJ1bSBvZiB5b3VyIGNob2ljZSAtIEkgcmVjb21tZW5kIG9uZSB3aXRoIGxvdHMgb2YgbHlyaWNzLiBSYXAgYWxidW1zIGFyZSBncmVhdCBmb3IgdGhpcy4KCjEuIEZpbmQgdGhlIGFsYnVtIGFuZCBnZXQgdGhlIGx5cmljcywgYW5kIHVubmVzdCB0aGVtLiAgCjIuIENsZWFuIHRoZSBseXJpY3MgYnkgcmVtb3Zpbmcgc3RvcHdvcmRzLCBhbmQgdGhlbiBjcmVhdGUgYSB0YWJsZSBhbmQgd29yZCBjbG91ZCB3aXRoIHRoZSB3b3JkcyBjb3VudHMuICAKMy4gRG8gc2VudGltZW50IGFuYWx5c2VzIHVzaW5nIGJpbmcgYW5kIG5yYywgYW5kIGNyZWF0ZSBncmFwaHMgb2YgdGhlIHdvcmRzIHRoYXQgY29udHJpYnV0ZSBtb3N0IHRvIGVhY2ggc2VudGltZW50LiAgCjQuIENyZWF0ZSBiaWdyYW1zIG9mIHRoZSBseXJpY3MsIHJlbW92ZSB0aGUgc3RvcHdvcmRzLCBhbmQgY3JlYXRlIGEgdGFibGUgYW5kIHdvcmQgY2xvdWQgb2YgdGhlIG1vc3QgY29tbW9uIGJpZ3JhbXMuICAKNS4gVXNlIHRoZSBiaWdyYW0gbWV0aG9kIHRvIGZpbmQgdGhlIG1vc3QgY29tbW9uIHdvcmRzIHRoYXQgY29tZSBhZnRlciB3b3JkcyBvZiB5b3VyIGNob2ljZSwgbGlrZSBpL3lvdSBvciBoZS9zaGUuCgpTdGVwIDEpIEZpbmQgdGhlIGFsYnVtIGFuZCBnZXQgdGhlIGx5cmljcywgYW5kIHVubmVzdCB0aGVtLgoKYGBge3J9CnNlYXJjaF9zb25nKCJtYW5zaW9uIikKYGBgCmBgYHtyfQpnZXRfc29uZ19tZXRhKDcyNTgyMCkKYGBgCgpgYGB7cn0KbWFuc2lvbl90cmFja3MgPC0gc2NyYXBlX3RyYWNrbGlzdCgxMTk4ODApCm1hbnNpb25fdHJhY2tzCmBgYAoKYGBge3J9Cm1hbnNpb25fbHlyaWNzIDwtIG1hcF9kZihtYW5zaW9uX3RyYWNrcyRzb25nX2x5cmljc191cmwsIHNjcmFwZV9seXJpY3NfdXJsKQptYW5zaW9uX2x5cmljcwoKYGBgCgpgYGB7cn0KbWFuc2lvbl93b3JkcyA8LSBtYW5zaW9uX2x5cmljcyAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIGxpbmUpICU+JSAKICBzZWxlY3Qoc29uZ19uYW1lLCB3b3JkKQoKbWFuc2lvbl93b3JkcwpgYGAKClN0ZXAgMikgQ2xlYW4gdGhlIGx5cmljcyBieSByZW1vdmluZyBzdG9wd29yZHMsIGFuZCB0aGVuIGNyZWF0ZSBhIHRhYmxlIGFuZCB3b3JkIGNsb3VkIHdpdGggdGhlIHdvcmRzIGNvdW50cy4KCmBgYHtyfQptYW5zaW9uX3dvcmRzICU+JSAKICBhbnRpX2pvaW4oZ2V0X3N0b3B3b3JkcygpKSAlPiUgCiAgY291bnQod29yZCwgc29ydCA9IFQpCmBgYAoKYGBge3J9Cm1hbnNpb25fd29yZHMgJT4lIAogIGFudGlfam9pbihnZXRfc3RvcHdvcmRzKCkpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lIAogIHRvcF9uKDIwMCkgJT4lIAogIHdvcmRjbG91ZDIoc2l6ZSA9IC41KQpgYGAKClN0ZXAgMy4gRG8gc2VudGltZW50IGFuYWx5c2VzIHVzaW5nIGJpbmcgYW5kIG5yYywgYW5kIGNyZWF0ZSBncmFwaHMgb2YgdGhlIHdvcmRzIHRoYXQgY29udHJpYnV0ZSBtb3N0IHRvIGVhY2ggc2VudGltZW50LiAgCgpgYGB7cn0KYmluZyA8LSBnZXRfc2VudGltZW50cygiYmluZyIpCgptYW5zaW9uX3dvcmRzICU+JSAKICBpbm5lcl9qb2luKGJpbmcpICU+JSAKICBjb3VudCh3b3JkLCBzZW50aW1lbnQsIHNvcnQgPSBUUlVFKQoKYGBgCgpgYGB7cn0KbWFuc2lvbl93b3JkcyAlPiUgCiAgaW5uZXJfam9pbihiaW5nKSAlPiUgCiAgY291bnQod29yZCwgc2VudGltZW50LCBzb3J0ID0gVFJVRSkgJT4lIAogIGdyb3VwX2J5KHNlbnRpbWVudCkgJT4lCiAgdG9wX24oMTApICU+JQogIHVuZ3JvdXAoKSAlPiUKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwgbikpICU+JQogIGdncGxvdChhZXMod29yZCwgbiwgZmlsbCA9IHNlbnRpbWVudCkpICsKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgZmFjZXRfd3JhcCh2YXJzKHNlbnRpbWVudCksIHNjYWxlcyA9ICJmcmVlIikgKwogIGxhYnMoeSA9ICJORiBtYW5zaW9uIGFsYnVtOiBXb3JkcyB0aGF0IGNvbnRyaWJ1dGUgdGhlIG1vc3QgdG8gZWFjaCBzZW50aW1lbnQiLAogICAgICAgeCA9IE5VTEwpICsKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsKICBjb29yZF9mbGlwKCkgKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCmBgYHtyfQpucmMgPC0gZ2V0X3NlbnRpbWVudHMoIm5yYyIpCm1hbnNpb25fd29yZHMgJT4lIAogIGlubmVyX2pvaW4obnJjKSAlPiUgCiAgY291bnQod29yZCwgc2VudGltZW50LCBzb3J0ID0gVFJVRSkgJT4lIAogIGdyb3VwX2J5KHNlbnRpbWVudCkgJT4lCiAgdG9wX24oMykgJT4lCiAgdW5ncm91cCgpICU+JQogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuLCBmaWxsID0gc2VudGltZW50KSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBmYWNldF93cmFwKHZhcnMoc2VudGltZW50KSwgc2NhbGVzID0gImZyZWUiKSArCiAgbGFicyh5ID0gIk5GJ3MgbWFuc2lvbiBhbGJ1bTogV29yZHMgdGhhdCBjb250cmlidXRlIHRoZSBtb3N0IHRvIGVhY2ggc2VudGltZW50IiwKICAgICAgIHggPSBOVUxMKSArCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgpTdGVwIDQuIDQuIENyZWF0ZSBiaWdyYW1zIG9mIHRoZSBseXJpY3MsIHJlbW92ZSB0aGUgc3RvcHdvcmRzLCBhbmQgY3JlYXRlIGEgdGFibGUgYW5kIHdvcmQgY2xvdWQgb2YgdGhlIG1vc3QgY29tbW9uIGJpZ3JhbXMuCgpgYGB7cn0KbWFuc2lvbl9seXJpY3MgJT4lCiAgdW5uZXN0X3Rva2VucyhiaWdyYW0sIGxpbmUsIHRva2VuID0gIm5ncmFtcyIsIG4gPSAyKSAlPiUgCiAgc2VsZWN0KGJpZ3JhbSkgLT4gbWFuc2lvbl9iaWdyYW1zCm1hbnNpb25fYmlncmFtcyAlPiUgCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpICU+JSAKICBmaWx0ZXIoIXdvcmQxICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUKICBmaWx0ZXIoIXdvcmQyICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgCiAgdW5pdGUoYmlncmFtLCB3b3JkMSwgd29yZDIsIHNlcCA9ICIgIikgJT4lIAogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpCgpgYGAKYGBge3J9Cm1hbnNpb25fYmlncmFtcyAlPiUgCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpICU+JSAKICBmaWx0ZXIoIXdvcmQxICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUKICBmaWx0ZXIoIXdvcmQyICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgCiAgdW5pdGUoYmlncmFtLCB3b3JkMSwgd29yZDIsIHNlcCA9ICIgIikgJT4lIAogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpICU+JSAKICBmaWx0ZXIobiA+IDEpICU+JSAKICB3b3JkY2xvdWQyKHNpemUgPSAxKQpgYGAKClN0ZXAgNS4gVXNlIHRoZSBiaWdyYW0gbWV0aG9kIHRvIGZpbmQgdGhlIG1vc3QgY29tbW9uIHdvcmRzIHRoYXQgY29tZSBhZnRlciB3b3JkcyBvZiB5b3VyIGNob2ljZSwgbGlrZSBpL3lvdSBvciBoZS9zaGUuCgpgYGB7cn0KZmlyc3Rfd29yZCA8LSBjKCJpIiwgInlvdSIpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgdGhlc2UgbmVlZCB0byBiZSBsb3dlcmNhc2UKCm1hbnNpb25fYmlncmFtcyAlPiUgCiAgY291bnQoYmlncmFtLCBzb3J0ID0gVCkgJT4lIAogIHNlcGFyYXRlKGJpZ3JhbSwgYygid29yZDEiLCAid29yZDIiKSwgc2VwID0gIiAiKSAlPiUgICAgICAgIyBzZXBhcmF0ZSB0aGUgdHdvIHdvcmRzCiAgZmlsdGVyKHdvcmQxICVpbiUgZmlyc3Rfd29yZCkgJT4lICAgICAgICAgICAgICAgICAgICAgICAgICAjIGZpbmQgZmlyc3Qgd29yZHMgZnJvbSBvdXIgbGlzdAogIGNvdW50KHdvcmQxLCB3b3JkMiwgd3QgPSBuLCBzb3J0ID0gVFJVRSkgJT4lIAogIHJlbmFtZSh0b3RhbCA9IG5uKSAlPiUgCgoKICBtdXRhdGUod29yZDIgPSBmYWN0b3Iod29yZDIsIGxldmVscyA9IHJldih1bmlxdWUod29yZDIpKSkpICU+JSAgICAgIyBwdXQgdGhlIHdvcmRzIGluIG9yZGVyCiAgZ3JvdXBfYnkod29yZDEpICU+JSAKICB0b3BfbigzKSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkMiwgdG90YWwsIGZpbGwgPSB3b3JkMSkpICsgICAgICAgICAgICAgICAgICAgICAgICAgICMKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBzZXQgdGhlIGNvbG9yIHBhbGV0dGUKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9IE5VTEwsIHRpdGxlID0gIldvcmQgZm9sbG93aW5nOiIpICsKICBmYWNldF93cmFwKH53b3JkMSwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkKCmBgYAoKCgoK