Loading the required libraries first

library(geniusr)
library(tidyverse)
library(tidytext)
library(textdata)
library(ggplot2)
library(dplyr)
library(wordcloud)
library(RColorBrewer)
library(reshape2)
library(stringr)

Foster the People’s Lyrics Sentiment Analysis

This project is a sentiment analysis of the songs of the indie pop band Foster The People. I analyze the sentiments of the songs collectively, grouped by the albums they appear on. The code is inspired by the class tutorial and Tom McNamara’s tutorial.

Getting the artist ID, song title and lyrics

# Look up the band on Genius; the search reports artist_id 703
search_artist("Foster The People") # 703
## # A tibble: 1 × 3
##   artist_id artist_name       artist_url                                  
##       <int> <chr>             <chr>                                       
## 1       703 Foster the People https://genius.com/artists/Foster-the-people
# Pull the song catalogue for that artist ID
songs <- get_artist_songs_df(703)

# Song IDs as character strings; this is the work list for the lyric download
ids <- as.character(songs$song_id)

# Start from an empty data frame that the lyrics are bound onto, song by song
allLyrics <- data.frame()

# First attempt at filling it, kept for reference but disabled:
#for (id in ids) {
  #allLyrics <- rbind(get_lyrics_id(id), allLyrics)
#}
# That plain loop behaved unreliably

The loop above behaves strangely: it returns lyrics for only some of the songs, and some of those lyrics are incomplete. The version below works around this by wrapping each request in tryCatch() and retrying the songs that failed.

# Download the lyrics for every ID in `ids`, retrying the ones that failed.
#
# Each pass walks the IDs that were outstanding when the pass started. A song
# is dropped from `ids` once its song_id shows up in `allLyrics`. Requests
# that raise an error are reported and left in `ids` for the next pass.
#
# The number of passes is capped: a song that never yields any rows (the
# request always errors, or it returns an empty lyrics table) would otherwise
# keep `length(ids) > 0` true forever.
maxPasses <- 20
passNo <- 0

while (length(ids) > 0 && passNo < maxPasses) {
  passNo <- passNo + 1
  for (id in ids) {
    tryCatch({
      allLyrics <- rbind(get_lyrics_id(id), allLyrics)
      successful <- unique(allLyrics$song_id)
      ids <- ids[!ids %in% successful]
      print(paste("done - ", id))
      print(paste("New length is ", length(ids)))
    }, error = function(e) {
      # Report the failure instead of discarding it; the ID stays in `ids`
      message("failed - ", id, ": ", conditionMessage(e))
    })
  }
}

# Make any permanently missing songs visible rather than silently absent
if (length(ids) > 0) {
  warning("No lyrics retrieved after ", maxPasses, " passes for song IDs: ",
          paste(ids, collapse = ", "), call. = FALSE)
}
## [1] "done -  351472"
## [1] "New length is  81"
## [1] "done -  2915122"
## [1] "New length is  80"
## [1] "done -  380374"
## [1] "New length is  79"
## [1] "done -  377697"
## [1] "New length is  78"
## [1] "done -  356833"
## [1] "New length is  77"
## [1] "done -  2915131"
## [1] "New length is  76"
## [1] "done -  3364211"
## [1] "New length is  75"
## [1] "done -  489310"
## [1] "New length is  74"
## [1] "done -  6163946"
## [1] "New length is  73"
## [1] "done -  64214"
## [1] "New length is  72"
## [1] "done -  2258730"
## [1] "New length is  71"
## [1] "done -  2128330"
## [1] "New length is  70"
## [1] "done -  452086"
## [1] "New length is  69"
## [1] "done -  187587"
## [1] "New length is  68"
## [1] "done -  2915136"
## [1] "New length is  67"
## [1] "done -  4087035"
## [1] "New length is  66"
## [1] "done -  3067104"
## [1] "New length is  65"
## [1] "done -  64221"
## [1] "New length is  64"
## [1] "done -  1933855"
## [1] "New length is  63"
## [1] "done -  1982413"
## [1] "New length is  62"
## [1] "done -  6340738"
## [1] "New length is  61"
## [1] "done -  187616"
## [1] "New length is  60"
## [1] "done -  352103"
## [1] "New length is  59"
## [1] "done -  3121066"
## [1] "New length is  58"
## [1] "done -  64223"
## [1] "New length is  57"
## [1] "done -  2093740"
## [1] "New length is  56"
## [1] "done -  3292371"
## [1] "New length is  55"
## [1] "done -  3083173"
## [1] "New length is  54"
## [1] "done -  64216"
## [1] "New length is  53"
## [1] "done -  1821707"
## [1] "New length is  52"
## [1] "done -  3121069"
## [1] "New length is  51"
## [1] "done -  3121055"
## [1] "New length is  50"
## [1] "done -  5385028"
## [1] "New length is  49"
## [1] "done -  64215"
## [1] "New length is  48"
## [1] "done -  5584386"
## [1] "New length is  47"
## [1] "done -  64217"
## [1] "New length is  46"
## [1] "done -  3067109"
## [1] "New length is  45"
## [1] "done -  482096"
## [1] "New length is  44"
## [1] "done -  3121065"
## [1] "New length is  43"
## [1] "done -  1573181"
## [1] "New length is  42"
## [1] "done -  64218"
## [1] "New length is  41"
## [1] "done -  379762"
## [1] "New length is  40"
## [1] "done -  2915876"
## [1] "New length is  39"
## [1] "done -  3121057"
## [1] "New length is  38"
## [1] "done -  3067103"
## [1] "New length is  37"
## [1] "done -  3374644"
## [1] "New length is  36"
## [1] "done -  4842185"
## [1] "New length is  35"
## [1] "done -  356823"
## [1] "New length is  34"
## [1] "done -  2915133"
## [1] "New length is  33"
## [1] "done -  1429"
## [1] "New length is  32"
## [1] "done -  2860591"
## [1] "New length is  31"
## [1] "done -  2255415"
## [1] "New length is  30"
## [1] "done -  2257844"
## [1] "New length is  29"
## [1] "done -  460874"
## [1] "New length is  28"
## [1] "done -  3067107"
## [1] "New length is  27"
## [1] "done -  3103488"
## [1] "New length is  26"
## [1] "done -  3403256"
## [1] "New length is  25"
## [1] "done -  3403245"
## [1] "New length is  24"
## [1] "done -  3599670"
## [1] "New length is  23"
## [1] "done -  3121058"
## [1] "New length is  22"
## [1] "done -  4409461"
## [1] "New length is  21"
## [1] "done -  2016764"
## [1] "New length is  20"
## [1] "done -  2146715"
## [1] "New length is  19"
## [1] "done -  380210"
## [1] "New length is  18"
## [1] "done -  489311"
## [1] "New length is  17"
## [1] "done -  5646231"
## [1] "New length is  16"
## [1] "done -  379776"
## [1] "New length is  15"
## [1] "done -  2915139"
## [1] "New length is  14"
## [1] "done -  2387275"
## [1] "New length is  13"
## [1] "done -  3121071"
## [1] "New length is  12"
## [1] "done -  6159351"
## [1] "New length is  11"
## [1] "done -  6163945"
## [1] "New length is  10"
## [1] "done -  64222"
## [1] "New length is  9"
## [1] "done -  64219"
## [1] "New length is  8"
## [1] "done -  7501188"
## [1] "New length is  7"
## [1] "done -  4069722"
## [1] "New length is  6"
## [1] "done -  6163947"
## [1] "New length is  5"
## [1] "done -  2016159"
## [1] "New length is  4"
## [1] "done -  2898013"
## [1] "New length is  3"
## [1] "done -  4637732"
## [1] "New length is  2"
## [1] "done -  7065584"
## [1] "New length is  1"
## [1] "done -  3089129"
## [1] "New length is  0"

Creating a dataframe containing the song IDs and their respective albums

# One row per song that has lyrics, plus the album Genius lists for it.
# `album` starts as NA (character) and stays NA when Genius reports no album.
allIds <- data.frame(song_id = unique(allLyrics$song_id))
allIds$album <- NA_character_

for (i in seq_len(nrow(allIds))) {
  # Select the album by column name rather than by position, so the lookup
  # does not depend on the column order of the get_song_df() result
  allIds$album[i] <- get_song_df(allIds$song_id[i])$album_name
  print(allIds[i, ])
}
##   song_id album
## 1 3089129  <NA>
##   song_id                      album
## 2 7065584 Torches X (Deluxe Edition)
##   song_id album
## 3 4637732  <NA>
##   song_id                      album
## 4 2898013 Torches X (Deluxe Edition)
##   song_id      album
## 5 2016159 Supermodel
##   song_id                                        album
## 6 6163947 In the Darkest of Nights, Let the Birds Sing
##   song_id album
## 7 4069722  <NA>
##   song_id album
## 8 7501188  <NA>
##   song_id   album
## 9   64219 Torches
##    song_id   album
## 10   64222 Torches
##    song_id                                        album
## 11 6163945 In the Darkest of Nights, Let the Birds Sing
##    song_id                                        album
## 12 6159351 In the Darkest of Nights, Let the Birds Sing
##    song_id              album
## 13 3121071 Sacred Hearts Club
##    song_id      album
## 14 2387275 Supermodel
##    song_id                                    album
## 15 2915139 Spotify Sessions (Live from The Village)
##    song_id      album
## 16  379776 Supermodel
##    song_id                                        album
## 17 5646231 In the Darkest of Nights, Let the Birds Sing
##    song_id      album
## 18  489311 Supermodel
##    song_id      album
## 19  380210 Supermodel
##    song_id      album
## 20 2146715 Supermodel
##    song_id      album
## 21 2016764 Supermodel
##    song_id album
## 22 4409461  <NA>
##    song_id              album
## 23 3121058 Sacred Hearts Club
##    song_id album
## 24 3599670  <NA>
##    song_id                     album
## 25 3403245 Sit Next to Me (Versions)
##    song_id                     album
## 26 3403256 Sit Next to Me (Versions)
##    song_id              album
## 27 3103488 Sacred Hearts Club
##    song_id              album
## 28 3067107 Sacred Hearts Club
##    song_id                      album
## 29  460874 Torches X (Deluxe Edition)
##    song_id             album
## 30 2257844 Torches (Remixes)
##    song_id             album
## 31 2255415 Torches (Remixes)
##    song_id             album
## 32 2860591 Torches (Remixes)
##    song_id   album
## 33    1429 Torches
##    song_id                                    album
## 34 2915133 Spotify Sessions (Live from The Village)
##    song_id      album
## 35  356823 Supermodel
##    song_id album
## 36 4842185  <NA>
##    song_id album
## 37 3374644  <NA>
##    song_id              album
## 38 3067103 Sacred Hearts Club
##    song_id              album
## 39 3121057 Sacred Hearts Club
##    song_id                                    album
## 40 2915876 Spotify Sessions (Live from The Village)
##    song_id      album
## 41  379762 Supermodel
##    song_id   album
## 42   64218 Torches
##    song_id album
## 43 1573181  <NA>
##    song_id              album
## 44 3121065 Sacred Hearts Club
##    song_id                      album
## 45  482096 Torches X (Deluxe Edition)
##    song_id              album
## 46 3067109 Sacred Hearts Club
##    song_id   album
## 47   64217 Torches
##    song_id                                        album
## 48 5584386 In the Darkest of Nights, Let the Birds Sing
##    song_id   album
## 49   64215 Torches
##    song_id album
## 50 5385028  <NA>
##    song_id              album
## 51 3121055 Sacred Hearts Club
##    song_id              album
## 52 3121069 Sacred Hearts Club
##    song_id                      album
## 53 1821707 Torches X (Deluxe Edition)
##    song_id   album
## 54   64216 Torches
##    song_id album
## 55 3083173  <NA>
##    song_id album
## 56 3292371  <NA>
##    song_id                      album
## 57 2093740 Torches X (Deluxe Edition)
##    song_id   album
## 58   64223 Torches
##    song_id              album
## 59 3121066 Sacred Hearts Club
##    song_id      album
## 60  352103 Supermodel
##    song_id      album
## 61  187616 Supermodel
##    song_id album
## 62 6340738  <NA>
##    song_id                                     album
## 63 1982413 Don’t Stop (Color on the Walls) [Remixes]
##    song_id                                     album
## 64 1933855 Don’t Stop (Color on the Walls) [Remixes]
##    song_id   album
## 65   64221 Torches
##    song_id              album
## 66 3067104 Sacred Hearts Club
##    song_id album
## 67 4087035  <NA>
##    song_id                                    album
## 68 2915136 Spotify Sessions (Live from The Village)
##    song_id      album
## 69  187587 Supermodel
##    song_id                      album
## 70  452086 Torches X (Deluxe Edition)
##    song_id                      album
## 71 2128330 Torches X (Deluxe Edition)
##    song_id             album
## 72 2258730 Torches (Remixes)
##    song_id   album
## 73   64214 Torches
##    song_id                                        album
## 74 6163946 In the Darkest of Nights, Let the Birds Sing
##    song_id                      album
## 75  489310 Torches X (Deluxe Edition)
##    song_id album
## 76 3364211  <NA>
##    song_id                                    album
## 77 2915131 Spotify Sessions (Live from The Village)
##    song_id      album
## 78  356833 Supermodel
##    song_id      album
## 79  377697 Supermodel
##    song_id      album
## 80  380374 Supermodel
##    song_id                                    album
## 81 2915122 Spotify Sessions (Live from The Village)
##    song_id      album
## 82  351472 Supermodel
# Attach each song's album to its lyric lines (the join key is inferred)
allLyrics <- allIds %>%
  full_join(allLyrics)
## Joining, by = "song_id"
# Peek at the first few song/album pairs
head(allIds, n = 6)
##   song_id                                        album
## 1 3089129                                         <NA>
## 2 7065584                   Torches X (Deluxe Edition)
## 3 4637732                                         <NA>
## 4 2898013                   Torches X (Deluxe Edition)
## 5 2016159                                   Supermodel
## 6 6163947 In the Darkest of Nights, Let the Birds Sing

Here we can see that some songs are not associated with an album. This means that they were released as singles and Genius has not assigned them to an album. The code below replaces those NAs with “Single Only”.

# Label the songs that have no album
allIds$album[is.na(allIds$album)] <- "Single Only"
head(allIds)
##   song_id                                        album
## 1 3089129                                  Single Only
## 2 7065584                   Torches X (Deluxe Edition)
## 3 4637732                                  Single Only
## 4 2898013                   Torches X (Deluxe Edition)
## 5 2016159                                   Supermodel
## 6 6163947 In the Darkest of Nights, Let the Birds Sing
# allLyrics was joined to allIds before the relabelling, so its `album`
# column still holds NA for those songs, and the join below matches on both
# song_id and album. Apply the same label to the lyric rows first; otherwise
# the singles keep NA as their album and each gains an extra, lyric-less
# "Single Only" row.
allLyrics2 <- allLyrics %>%
  mutate(album = coalesce(album, "Single Only")) %>%
  full_join(allIds)
## Joining, by = c("song_id", "album")

Tokenizing the words

# Split every lyric line into one row per word
allLyricsTokenised <- unnest_tokens(allLyrics2, word, line)

Looking at the most common word

# Six most frequent words, stop words included
allLyricsTokenised %>%
  count(word, sort = TRUE) %>%
  head()
##   word    n
## 1  you 1026
## 2  the  988
## 3    i  738
## 4   to  591
## 5  and  590
## 6 yeah  414

Interestingly, songs by Foster the People have “you” as the most common word!

There are a lot of stopwords, so the code below removes these.

# Drop every token that appears in tidytext's stop_words lexicon
tidyLyrics <- anti_join(allLyricsTokenised, stop_words)
## Joining, by = "word"
# Six most frequent words once stop words are gone
tidyLyrics %>%
  count(word, sort = TRUE) %>%
  head()
##   word   n
## 1 yeah 414
## 2  run 241
## 3 stop 201
## 4  ooh 172
## 5 love 129
## 6  doo  84

Now, the most common word is “yeah”, followed by “run”, “stop”, “ooh”, “love”. I guess, “you” held some weight here!!

Visualizing Top Lyrics

# Number the occurrences of each word within each album (1, 2, 3, ...)
topFew <- tidyLyrics %>%
  group_by(album, word) %>%
  mutate(n = seq_len(n())) %>%
  ungroup()

Removing extra columns from the dataframe

# Keep only album, word and the running count, then collapse to one row per
# album/word pair holding the highest running count (i.e. the number of
# occurrences of that word on that album)
topFew <- topFew %>%
  select(album, word, n) %>%
  group_by(album, word) %>%
  summarise(n = max(n)) %>%
  ungroup()
## `summarise()` has grouped output by 'album'. You can override using the
## `.groups` argument.

Adding up the per-album counts for each word and keeping only the words that appear at least 40 times in total. Also removing the word “ooh”!

# Total each word across albums, then keep the frequent ones (40+ uses),
# leaving out "ooh"
topFew <- topFew %>%
  group_by(word) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  filter(total >= 40 & word != "ooh")

Assigning colors for each album that will show up in the graph.

# Fill colour for every album, keyed by the album name exactly as Genius
# returns it. The remix EP is spelled with a typographic apostrophe (U+2019)
# and "Color" in the singular; a key that differs in either respect never
# matches the data, and that album loses its colour.
albumCol <- c(
  "Don\u2019t Stop (Color on the Walls) [Remixes]"  = "#394887",
  "In the Darkest of Nights, Let the Birds Sing"    = "#9e5a47",
  "Sacred Hearts Club"                              = "#f9c784",
  "Sit Next to Me (Versions)"                       = "#cf57d4",
  "Spotify Sessions (Live from The Village)"        = "#e8b0a5",
  "Supermodel"                                      = "#d18943",
  "Torches"                                         = "#4C1A57",
  "Torches (Remixes)"                               = "#52BA4A",
  "Torches X (Deluxe Edition)"                      = "#7268CC",
  "Single Only"                                     = "#5BBFCF"
)

# Songs without an album can arrive here as a missing value or already
# labelled "Single Only". Give them one explicit label so they get their own
# legend entry and colour (the string "NA" matches neither form).
topFew$album <- as.character(topFew$album)
topFew$album[is.na(topFew$album)] <- "Single Only"

# This ensures bars are stacked in order of release date. Every album in the
# palette must be listed here: a name missing from `levels` turns into NA.
# NOTE(review): "Sacred Hearts Club" was absent from the original list; it is
# placed between "Supermodel" and "Sit Next to Me (Versions)" - confirm the
# intended position.
topFew$album <- factor(topFew$album, levels = c(
  "Torches",
  "Torches (Remixes)",
  "Don\u2019t Stop (Color on the Walls) [Remixes]",
  "Spotify Sessions (Live from The Village)",
  "Supermodel",
  "Sacred Hearts Club",
  "Sit Next to Me (Versions)",
  "In the Darkest of Nights, Let the Birds Sing",
  "Torches X (Deluxe Edition)",
  "Single Only"
))

Plot for most used words

# Horizontal stacked bar chart of the most used words: one bar per word,
# ordered by its total count, with one segment per album.
wordsPlot <- ggplot(topFew) +

  # geom_col() draws the supplied y values directly
  geom_col(aes(x = reorder(word, total),
               y = n,
               fill = album),
           colour = "black") +

  coord_flip() +

  # The subset keeps words with total >= 40, hence "at least 40"
  labs(title = "Foster The People's most used words",
       subtitle = "The words that appear at least 40 times in Foster The People's catalogue",
       caption = "Source: genius.com",
       y = "Number of appearances",
       x = "Word",
       fill = "Album") +

  # Named palette: colours are matched to albums by name
  scale_fill_manual(values = albumCol) +

  theme(title = element_text(face = "italic", size = 10),

        panel.border = element_rect(colour = "black", fill = NA, size = 1),
        panel.background = element_rect(colour = "black", fill = "white"),
        panel.grid.major.x = element_line(colour = "grey90", size = 0.1, linetype = 1),

        axis.title = element_text(face = "italic", size = 9, colour = "black"),
        axis.ticks.length = unit(5, units = "pt"),

        legend.background = NULL,
        legend.position = "top",
        legend.key.size = unit(8, "pt"),
        legend.box.spacing = unit(5, "pt"),
        legend.text = element_text(size = 8),

        axis.text.y = element_text(size = 8))

wordsPlot

Sentiment Analysis on Some of the Albums Using “bing”

Sentiment Plot

# Create Sentiment dataframe
ftpsentiments <- tidyLyrics %>%
  inner_join(get_sentiments("bing"))%>% 
  count(album, song_name, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Factor as we did above
ftpsentiments$album <- factor(ftpsentiments$album, 
                               levels = c("Torches",
                                          "Don't Stop (Colors on the Walls) [Remixes]", 
                                          "Spotify Sessions (Live from The Village)", 
                                          "Supermodel", 
                                          "In the Darkest of Nights, Let the Birds Sing"))


# sent plot
sentPlot <- ggplot(ftpsentiments,
                   aes(reorder(song_name, 
                               sentiment), 
                       sentiment, 
                       fill = album)) +
  
  geom_col(show.legend = FALSE) +
  
  facet_wrap(~album, 
             ncol = 3, 
             scales = "free")+
  
  scale_fill_manual(values = albumCol)+
  
  labs(title = "Foster The People's songs ranked by sentiment",
       caption = "Source: genius.com",
       y = "Sentiment score",
       fill = "Album")+
  
  theme(title = element_text(face = "italic", size = 10), 
      
      panel.border = element_rect(colour = "black", fill=NA, size=1),
      panel.background = element_rect(colour = "black", fill = "white"),
      panel.grid.major.x = element_line(colour="grey90",size = 0.1, linetype = 1),
      
      axis.title.x = element_text(face = "italic",size = 8, colour = "black"),
      axis.title.y = element_blank(),
      axis.ticks.length = unit(5, units = "pt"),
      
      legend.background = NULL,
      legend.position = "top",
      legend.key.size = unit(8,"pt"),
      legend.box.spacing = unit(5,"pt")) +
  
  coord_flip()

sentPlot

Evaluating Positive and Negative Words

This section evaluates the frequency of positive and negative words in the lyrics.

# How often each "bing" lexicon word occurs in the lyrics, most frequent first
bing_word_counts <- tidyLyrics %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
## Joining, by = "word"
# Ten most frequent words per sentiment, drawn as horizontal bars with one
# panel per sentiment
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = NULL,
       y = "Contribution to sentiment") +
  coord_flip()
## Selecting by n

Wordcloud

Creating a simple wordcloud out of the tokenized words.

# Word frequencies drawn as a cloud, limited to 100 words
tidyLyrics %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining, by = "word"

Creating a stylized wordcloud depicting positive emotion words in pink and negative emotion words in gray.

# Count each "bing" lexicon word, reshape to a word-by-sentiment matrix
# (0 where a word lacks that sentiment), and draw the two sentiments as a
# comparison cloud
tidyLyrics %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 100,
                   colors = c("gray40", "pink"))
## Joining, by = "word"