Libraries

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ──────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Getting lyrics from artist/album works

# Download the lyrics of Queen's "A Night At the Opera".
# NOTE(review): genius_album() is presumably from the genius package; the
# library() calls are not echoed in this document, so confirm.
album <- genius_album(
  artist = "Queen",
  album = "A Night At the Opera"
)
## Joining, by = c("track_title", "track_n", "track_url")

Summarise the data loaded

# Per-column summary of the downloaded lyrics table.
summary(album)
##  track_title           track_n            line           lyric          
##  Length:407         Min.   : 1.000   Min.   :  1.00   Length:407        
##  Class :character   1st Qu.: 4.000   1st Qu.: 10.00   Class :character  
##  Mode  :character   Median : 7.000   Median : 19.00   Mode  :character  
##                     Mean   : 6.619   Mean   : 25.21                     
##                     3rd Qu.: 8.000   3rd Qu.: 32.00                     
##                     Max.   :12.000   Max.   :100.00
# List the distinct track titles (12 track titles).
unique(album[["track_title"]])
##  [1] "Death on Two Legs (Dedicated to...)"
##  [2] "Lazing on a Sunday Afternoon"       
##  [3] "I’m in Love with My Car"            
##  [4] "You're My Best Friend"              
##  [5] "'39"                                
##  [6] "Sweet Lady"                         
##  [7] "Seaside Rendezvous"                 
##  [8] "The Prophet's Song"                 
##  [9] "Love of My Life"                    
## [10] "Good Company"                       
## [11] "Bohemian Rhapsody"                  
## [12] "God Save the Queen"

Clean the data

# Replace every character outside a-z, A-Z, 0-9 and the space character
# with a single space. gsub() is vectorised, so `x` may be a whole
# character vector.
removeSpecialCharacters <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = " ", x = x)
}
# Strip special characters, then lower-case the lyrics.
# gsub() (inside removeSpecialCharacters) and tolower() are already
# vectorised, so they are applied to the whole column at once; sapply()
# over a character vector is unnecessary and would also attach the original
# lyric strings as names to the resulting column.
album$lyric <- tolower(removeSpecialCharacters(album$lyric))

Tokenize

# Number the lyric lines within each track, then split each lyric into
# one word per row (column `word`).
tidy_lyrics <- album %>%
  group_by(track_title) %>%
  mutate(linenumber = seq_len(n())) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = lyric)

# Show the tokenised table.
tidy_lyrics
## # A tibble: 2,860 x 5
##    track_title                         track_n  line linenumber word 
##    <chr>                                 <int> <int>      <int> <chr>
##  1 Death on Two Legs (Dedicated to...)       1     1          1 ahh  
##  2 Death on Two Legs (Dedicated to...)       1     2          2 you  
##  3 Death on Two Legs (Dedicated to...)       1     2          2 suck 
##  4 Death on Two Legs (Dedicated to...)       1     2          2 my   
##  5 Death on Two Legs (Dedicated to...)       1     2          2 blood
##  6 Death on Two Legs (Dedicated to...)       1     2          2 like 
##  7 Death on Two Legs (Dedicated to...)       1     2          2 a    
##  8 Death on Two Legs (Dedicated to...)       1     2          2 leech
##  9 Death on Two Legs (Dedicated to...)       1     2          2 you  
## 10 Death on Two Legs (Dedicated to...)       1     2          2 break
## # … with 2,850 more rows

“Easy” and “poor” are the most frequent sentiment words in Bohemian Rhapsody (four occurrences each).

# Count the words of Bohemian Rhapsody that appear in the "bing" sentiment
# lexicon, most frequent first.
tidy_lyrics %>%
  filter(track_title == "Bohemian Rhapsody") %>%
  # Name the join key explicitly rather than relying on the implicit
  # match on shared column names.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 13 x 2
##    word            n
##    <chr>       <int>
##  1 easy            4
##  2 poor            4
##  3 die             2
##  4 aching          1
##  5 cry             1
##  6 dead            1
##  7 devil           1
##  8 frightening     1
##  9 killed          1
## 10 love            1
## 11 loves           1
## 12 monstrosity     1
## 13 right           1

Find out the sentiment

Count the number of positive and negative words in each track, then calculate each track's sentiment score as positive minus negative.

# Net sentiment per track: number of positive words minus number of
# negative words, using the "bing" lexicon.
# Tracks with no word in the lexicon produce no rows after the inner join
# and therefore do not appear in the result.
album_sentiment <- tidy_lyrics %>%
  # Explicit join key instead of the implicit match on shared column names.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(track_title, sentiment) %>%
  # One column per sentiment label; a track missing a label gets 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Print the per-track sentiment table.
album_sentiment
## # A tibble: 11 x 4
##    track_title                         negative positive sentiment
##    <chr>                                  <dbl>    <dbl>     <dbl>
##  1 '39                                        3        5         2
##  2 Bohemian Rhapsody                         13        7        -6
##  3 Death on Two Legs (Dedicated to...)       18       14        -4
##  4 Good Company                               3       14        11
##  5 I’m in Love with My Car                    1       10         9
##  6 Lazing on a Sunday Afternoon               0        1         1
##  7 Love of My Life                            4        9         5
##  8 Seaside Rendezvous                         2       13        11
##  9 Sweet Lady                                 2       25        23
## 10 The Prophet's Song                        32       17       -15
## 11 You're My Best Friend                      3       15        12

Visualize the sentiments

# Horizontal bar chart of the net sentiment score for each track.
# The legend is hidden because the fill colour only repeats the track
# title already shown on the axis.
album_sentiment %>%
  ggplot(aes(x = track_title, y = sentiment, fill = track_title)) +
  geom_col(show.legend = FALSE) +
  coord_flip()