Instructions:

Install and load all the packages required for this workshop all at once with this code:

# Install pacman on first use. requireNamespace() only checks that the
# package is available; it does not need to be attached because p_load()
# is called through `pacman::` below.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Each package is listed once; textdata is included because PART 2 attaches
# it before fetching the sentiment lexicons.
pacman::p_load(
  tidyverse, #data manipulation and visualizations
  gridExtra, #viewing multiple plots together, `grid.arrange()`
  tidytext, #text mining
  textdata, #attached in PART 2 before the lexicons are fetched
  wordcloud2, #creative visualizations
  ggrepel, #`geom_label_repel`
  knitr, #Create nicely formatted output tables
  formattable, #For the color_tile function
  circlize, #Visualizations - chord diagram
  memery, #Memes - images with plots
  magick, #Memes - images with plots (image_read)
  yarrr, #Pirate plot
  radarchart, #Visualizations
  igraph, #ngram network diagrams
  ggraph #ngram network diagrams
)

You can also install and load the packages individually from the list below.

Load packages for preliminary tasks

library(tidyverse) #data manipulation and visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
library(wordcloud2) #creative visualizations

Load packages for visualizations!

library(ggplot2) #Visualizations (also included in the tidyverse package)
library(ggrepel) #`geom_label_repel`
library(gridExtra) #`grid.arrange()` for multi-graphs
library(knitr) #Create nicely formatted output tables
library(formattable) #For the color_tile function
library(circlize) #Visualizations - chord diagram
library(memery) #Memes - images with plots
library(magick) #Memes - images with plots (image_read)
library(yarrr)  #Pirate plot
library(radarchart) #Visualizations
library(igraph) #ngram network diagrams
library(ggraph) #ngram network diagrams

The task at hand:

Conduct a sentiment analysis on the song lyrics of Evanescence using data from Genius.

What is Sentiment Analysis? It is a form of text classification that helps us analyze the underlying sentiment in messages.

Phases of this Workshop

-PART 1 - Data Cleaning / Text Mining

-PART 2 - Sentiment Analysis

PART 1 - Data Cleaning / Text Mining

Step 1- Data Inspection

Load dataset.

# readr is attached here, but the call below is base R's read.csv().
library(readr)
# Read the lyrics CSV from the GitHub URL into the data frame `eva`.
eva <- read.csv('https://raw.githubusercontent.com/karendelarosa/datasets/master/eva_lyrics.csv')

See the number of cases, variable names, and type of variables.

str(eva)
## 'data.frame':    2245 obs. of  8 variables:
##  $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ artist    : chr  "Evanescence" "Evanescence" "Evanescence" "Evanescence" ...
##  $ album     : chr  "Origin" "Origin" "Origin" "Origin" ...
##  $ year      : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ track_n   : int  1 1 1 1 1 1 2 2 2 2 ...
##  $ verse_line: int  1 2 3 4 5 6 1 2 3 4 ...
##  $ lyrics    : chr  "(I'm sure it's getting darker" "Don't close your eyes" "Don't make a sound" "Don't make a sound)" ...
##  $ song      : chr  "Origin" "Origin" "Origin" "Origin" ...

Looks like R is reading the album column as character. In order to make comparisons between albums on this analysis, we should convert album to factor.

Make the conversion.

# Replace the character album column with a factor (one level per album).
eva$album <- factor(eva$album)

Check the factors on the album variable with the table function.

table(eva$album)
## 
##      Evanescence           Fallen           Origin The Bitter Truth 
##              613              454              300              423 
##    The Open Door 
##              455

The Evanescence albums featured in this dataset are Evanescence, Fallen, Origin, The Bitter Truth, and The Open Door.

Everything looks good thus far.

Step 2 - Clean dataset

Expand linguistic contractions. Use function() to define a helper that expands contractions in English-language text.

Note: “won’t” is a special case as it does not expand to “wo not”.

# Expand common English contractions.
#
# doc: character vector (for example, one lyric line per element).
# Returns a character vector of the same length with contractions expanded.
fix.contractions <- function(doc) {
  # The negative forms ignore case: lyrics are still in their original case
  # when this runs, and a line-initial "Won't" or "Can't" would otherwise
  # skip its special rule and be split by the generic "n't" rule into
  # "Wo not" / "Ca not".
  doc <- gsub("won't", "will not", doc, ignore.case = TRUE)
  doc <- gsub("can't", "can not", doc, ignore.case = TRUE)
  doc <- gsub("n't", " not", doc, ignore.case = TRUE)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'d", " would", doc)
  # "'s" is dropped rather than expanded.
  doc <- gsub("'s", "", doc)
  return(doc)
}

Fix (expand) contractions.

# fix.contractions() is built from gsub() calls, which already work on a
# whole character vector, so the column is passed in directly.
eva$lyrics <- fix.contractions(eva$lyrics)

Create function to remove special characters.

# Replace every character outside a-z, A-Z, 0-9 and the space character
# with a single space. Accepts and returns a character vector.
removeSpecialChars <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = " ", x = x)
}

Remove special characters.

# removeSpecialChars() wraps gsub(), which works on a whole character
# vector, so the column is passed in directly.
eva$lyrics <- removeSpecialChars(eva$lyrics)

Convert everything to lower case.

# tolower() works on a whole character vector, so no sapply() is needed.
eva$lyrics <- tolower(eva$lyrics)

Inspect the cleaned dataset.

head(eva)
##   X      artist  album year track_n verse_line
## 1 1 Evanescence Origin 2000       1          1
## 2 2 Evanescence Origin 2000       1          2
## 3 3 Evanescence Origin 2000       1          3
## 4 4 Evanescence Origin 2000       1          4
## 5 5 Evanescence Origin 2000       1          5
## 6 6 Evanescence Origin 2000       1          6
##                                          lyrics   song
## 1                   i am sure it getting darker Origin
## 2                        do not close your eyes Origin
## 3                           do not make a sound Origin
## 4                          do not make a sound  Origin
## 5 you hold the answer deep within your own mind Origin
## 6                death in its most hideous form Origin

Create decade columns.

# Bucket each release year into a decade label. `year` is referenced as a
# column of the piped data rather than through `eva$year`. Years outside
# both ranges get the literal string "NA" (not a missing value), which is
# what the later `decade != "NA"` filters test for.
eva <- eva %>%
  mutate(
    decade = case_when(
      year %in% 2000:2009 ~ "2000-2009",
      year %in% 2010:2021 ~ "2010-2021",
      TRUE ~ "NA"
    )
  )

Check the decade column.

str(eva$decade)
##  chr [1:2245] "2000-2009" "2000-2009" "2000-2009" "2000-2009" "2000-2009" ...
table(eva$decade)
## 
## 2000-2009 2010-2021 
##      1209      1036

The decade brackets look good. In this case, we only have two brackets: 2000-2009 and 2010-2021.

Visualization 1 - Evanescence songs released per decade

Define some colors for visualizations.

# Five hex colours available to the plots; the top-words bar chart fills
# its bars with the third entry.
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00")

# Theme adjustments shared by the lyric plots: centred title, no x-axis
# text, no axis ticks, no major or minor grid lines, and no legend.
theme_lyrics <- function() {
  lyric_theme <- theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )
  lyric_theme
}

See how many Evanescence songs were released per decade. To do this, first filter out any rows whose decade is missing (NA).

#Song Stats
# The data has one row per lyric line, so rows are first collapsed to one
# row per (decade, album, song). Counting those rows gives the number of
# songs; each bar is stacked by album and its total height is the number
# of songs released in that decade.
eva %>%
  filter(decade != "NA") %>%
  distinct(decade, album, song) %>%
  count(decade, album, name = "song_count") %>%
  ggplot() +
  geom_col(aes(x = decade, y = song_count, fill = album)) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.grid.minor = element_blank()) +
  ggtitle("Evanescence - Released Songs") +
  labs(x = NULL, y = "Song Count")

Step 3 - Text Mining

Text mining is a form of analytics that helps us look at trends and meanings in text data.

Remove words that do not add any meaning to underlying messages in the songs.

# Tokens to drop from the lyrics. Entries are compared with `%in%` against
# single tokens, which never contain spaces, so every entry is written
# without surrounding whitespace and appears only once.
undesirable_words <- c("chorus", "repeat", "lyrics",
                       "theres", "bridge", "fe0f", "yeah", "baby",
                       "alright", "wanna", "gonna", "verse",
                       "whoa", "gotta", "make", "miscellaneous", "2",
                       "4", "ooh", "uurh", "pheromone", "poompoom", "3121",
                       "matic", "ai", "ca", "la", "hey", "na",
                       "da", "uh", "tin", "ll", "transcription",
                       "repeats")

Stop words

Stop words are terms that may not add any meaning to the text. There are different lists to choose from, but we will use the lexicon called ‘stop_words’ from the tidytext package.

Create the tidy text format. The code unnests the lyrics into one word per row and then removes undesirable words, short words, and stop words.

# Stop and Short words
# One row per word, minus undesirable words, words shorter than three
# characters, and tidytext stop words. The join key is named explicitly so
# the result does not depend on which column names the two tables share.
eva_tidy <- eva %>%
  unnest_tokens(word, lyrics) %>% #Break the lyrics into individual words
  filter(!word %in% undesirable_words) %>% #Remove undesirables
  filter(nchar(word) >= 3) %>% #Drop short words like "ah" or "oo" used in music
  anti_join(stop_words, by = "word") #Data provided by the tidytext package
glimpse(eva_tidy) #From `dplyr`, better than `str()`.
## Rows: 3,551
## Columns: 9
## $ X          <int> 1, 2, 2, 3, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 9, 9, 10, 11, 11,…
## $ artist     <chr> "Evanescence", "Evanescence", "Evanescence", "Evanescence",…
## $ album      <fct> Origin, Origin, Origin, Origin, Origin, Origin, Origin, Ori…
## $ year       <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,…
## $ track_n    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ verse_line <int> 1, 2, 2, 3, 4, 5, 5, 5, 5, 6, 6, 6, 1, 1, 3, 3, 4, 5, 5, 6,…
## $ song       <chr> "Origin", "Origin", "Origin", "Origin", "Origin", "Origin",…
## $ decade     <chr> "2000-2009", "2000-2009", "2000-2009", "2000-2009", "2000-2…
## $ word       <chr> "darker", "close", "eyes", "sound", "sound", "hold", "answe…

Use sample() to show a random list of stop words and head() to limit to 15 words.

head(sample(stop_words$word, 15),15)
##  [1] "cant"         "thoughts"     "just"         "downing"      "wish"        
##  [6] "already"      "anywhere"     "either"       "couldn't"     "nothing"     
## [11] "etc"          "respectively" "get"          "also"         "through"

Unnest and remove stop, undesirable, and short words.

# One row per word with stop words removed, exact duplicate rows dropped,
# and undesirable words and words of three characters or fewer filtered
# out. The join key is named explicitly so the result does not depend on
# which column names the two tables share.
eva_filtered <- eva %>%
  unnest_tokens(word, lyrics) %>%
  anti_join(stop_words, by = "word") %>%
  distinct() %>%
  filter(!word %in% undesirable_words) %>%
  filter(nchar(word) > 3)

Visualization 2 - Barplot Top Words

Most frequently used words in the full set of lyrics of Evanescence.

# Bar chart of the 20 most frequent words (ties at the cut-off are kept).
# `n` is the number of rows of eva_filtered per word, so the axis is
# labelled as a word count.
eva_filtered %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>% #Order the bars by frequency
  ggplot() +
  geom_col(aes(word, n), fill = my_colors[3]) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank()) +
  xlab("") +
  ylab("Word Count") +
  ggtitle("Most Frequently Used Words in Evanescence Lyrics") +
  coord_flip()

Visualization 3 - Word cloud

Just for creative purposes, let’s compute a word cloud with the most used words by Evanescence.

# Word frequencies, most frequent first.
eva_words_counts <- eva_filtered %>%
  count(word, sort = TRUE)

# head() keeps at most the first 300 rows; unlike `[1:300, ]` it does not
# pad the table with NA rows when fewer than 300 words are available.
wordcloud2(head(eva_words_counts, 300), size = .5)

Visualization 4 - Barplot Timeless Words

Some words in music are considered to be timeless. Certain timeless words in song lyrics might be recurrent across albums from the same artist and appeal to large audiences. If we break down our analysis across decades, these words will roll to the top.

Take a look at Evanescence’s timeless words by decade.

# Ten most frequent words per decade. `row` gives every (decade, word)
# pair its own position so bars can be ordered independently per facet.
timeless_words <- eva_filtered %>%
  filter(decade != "NA") %>%
  count(decade, word, sort = TRUE) %>%
  group_by(decade) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  arrange(decade, n) %>%
  mutate(row = row_number())

timeless_words %>%
  ggplot(aes(row, n, fill = decade)) +
  geom_col(show.legend = FALSE) + # show.legend expects a logical value
  labs(x = NULL, y = "Word Count") +
  ggtitle("Timeless Words") +
  theme_lyrics() +
  facet_wrap(~decade, scales = "free", ncol = 5) +
  scale_x_continuous(  # This handles replacement of row
    breaks = timeless_words$row, # notice need to reuse data frame
    labels = timeless_words$word) +
  coord_flip()

PART 2 - Sentiment Analysis

Explore Sentiment Lexicons

The tidytext package includes a dataset called ‘sentiments’ which contains several lexicons. These lexicons are dictionaries of words with an assigned sentiment category or value.

The tidytext package contains three lexicons:

- AFINN: Assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment

- Bing: Places words into positive or negative categories.

- NRC: Assigns words into one or more of the following ten categories: positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.

We will use the Bing and NRC lexicons for our Sentiment Analysis.

library("textdata")
# Tag each lexicon with its name and its number of distinct words.
bing <- get_sentiments("bing") %>%
  mutate(lexicon = "bing",
         words_in_lexicon = n_distinct(word))

nrc <- get_sentiments("nrc") %>%
  mutate(lexicon = "nrc",
         words_in_lexicon = n_distinct(word))

# Stack only the two tagged lexicons. The `sentiments` dataset is left out
# because the Bing words are already present via `bing`, so adding it
# would repeat them.
# NOTE(review): in recent tidytext releases `sentiments` appears to hold
# only the Bing lexicon, without a `lexicon` column - confirm against the
# installed version.
new_sentiments <- bind_rows(bing, nrc)

Create an object for each lexicon. Use Bing for binary valence and NRC for categorical sentiments.

# Attach sentiment labels to the tidy tokens. The join key is named
# explicitly so the result does not depend on which column names the
# tables share.
eva_bing <- eva_tidy %>%
  inner_join(get_sentiments("bing"), by = "word")
eva_nrc <- eva_tidy %>%
  inner_join(get_sentiments("nrc"), by = "word")
# Same rows as eva_nrc minus the "positive"/"negative" labels, leaving the
# categorical sentiments only; derived from eva_nrc instead of repeating
# the join.
eva_nrc_sub <- eva_nrc %>%
  filter(!sentiment %in% c("positive", "negative"))

Visualization 5 - NRC Sentiment Analysis on the lyrical data

Graph the NRC Sentiment Analysis on the entire dataset.

# Horizontal bar chart of how many tokens fall under each NRC sentiment.
nrc_plot <- eva_nrc %>%
  count(sentiment, name = "word_count") %>%
  mutate(sentiment = reorder(sentiment, word_count)) %>%
  #Negating the fill value makes the larger bars darker
  ggplot(aes(x = sentiment, y = word_count, fill = -word_count)) +
  geom_col() +
  guides(fill = "none") + #No legend
  theme_lyrics() +
  labs(x = NULL, y = "Word Count") +
  scale_y_continuous(limits = c(0, 1000)) + #Axis limit is hard coded
  ggtitle("Evanescence NRC Sentiment") +
  coord_flip()
lab <- ""  #Empty label; not referenced again in the code shown here
nrc_plot

Visualization 6 - Bing Sentiment Analysis on the lyrical data

Now take a look at the Bing overall sentiment.

# Horizontal bar chart of how many tokens Bing labels positive vs negative.
bing_plot <- eva_bing %>%
  count(sentiment, name = "word_count") %>%
  mutate(sentiment = reorder(sentiment, word_count)) %>%
  ggplot(aes(x = sentiment, y = word_count, fill = sentiment)) +
  geom_col() +
  guides(fill = "none") + #No legend
  theme_lyrics() +
  labs(x = NULL, y = "Word Count") +
  scale_y_continuous(limits = c(0, 1000)) + #Axis limit is hard coded
  ggtitle("Evanescence Bing Sentiment") +
  coord_flip()
lab1 <- ""  #Empty label; not referenced again in the code shown here
bing_plot

Visualizations 7 and 8 - Evanescence NRC words (Years 2000 and 2021)

Create object for Evanescence NRC words 2000.

# Up to ten most frequent words per NRC sentiment for the year 2000.
# `year` is an integer column, so it is compared with a number rather than
# a string. count(sort = TRUE) already orders rows by descending n, so
# slice() picks the top rows within each sentiment group.
plot_words_2000 <- eva_nrc %>%
  filter(year == 2000) %>%
  group_by(sentiment) %>%
  count(word, sort = TRUE) %>%
  slice(seq_len(10)) %>%
  ungroup()

Plot NRC words for Evanescence (year 2000).

# One panel per NRC sentiment, with each word drawn as a repelled label.
# The points themselves are transparent; only the labels are visible.
plot_words_2000 %>%
  ggplot(aes(x = word, y = 1, label = word, fill = sentiment)) +
  geom_point(color = "transparent") +
  geom_label_repel(
    force = 1,
    nudge_y = .5,
    direction = "y",
    box.padding = 0.05,
    segment.color = "transparent",
    size = 3
  ) +
  facet_grid(~sentiment) +
  theme_lyrics() +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_text(size = 6),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "lightgray", fill = NA),
    strip.text.x = element_text(size = 9)
  ) +
  labs(x = NULL, y = NULL) +
  ggtitle("Evanescence 2000 NRC Sentiment") +
  coord_flip()

Now create object for Evanescence NRC words 2021.

# Up to ten most frequent words per NRC sentiment for the year 2021.
# `year` is an integer column, so it is compared with a number rather than
# a string. count(sort = TRUE) already orders rows by descending n, so
# slice() picks the top rows within each sentiment group.
plot_words_2021 <- eva_nrc %>%
  filter(year == 2021) %>%
  group_by(sentiment) %>%
  count(word, sort = TRUE) %>%
  slice(seq_len(10)) %>%
  ungroup()

Plot NRC words for Evanescence (year 2021).

# One panel per NRC sentiment, with each word drawn as a repelled label.
# The points themselves are transparent; only the labels are visible.
plot_words_2021 %>%
  ggplot(aes(x = word, y = 1, label = word, fill = sentiment)) +
  geom_point(color = "transparent") +
  geom_label_repel(
    force = 1,
    nudge_y = .5,
    direction = "y",
    box.padding = 0.05,
    segment.color = "transparent",
    size = 3
  ) +
  facet_grid(~sentiment) +
  theme_lyrics() +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_text(size = 6),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "lightgray", fill = NA),
    strip.text.x = element_text(size = 9)
  ) +
  labs(x = NULL, y = NULL) +
  ggtitle("Evanescence 2021 NRC Sentiment") +
  coord_flip()

Visualization 9 - NRC sentiment categories for specific songs

Look at the themes and see if they appear to be correlated across songs and albums.

# NRC category counts for five selected songs, one facet per year and song.
eva_nrc_sub %>%
  filter(
    song %in% c(
      "Where Will You Go",
      "Going Under",
      "Sweet Sacrifice",
      "What You Want",
      "Broken Pieces Shine"
    )
  ) %>%
  count(song, sentiment, year) %>%
  mutate(
    sentiment = reorder(sentiment, n),
    song = reorder(song, n)
  ) %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(year ~ song, scales = "free_x", labeller = label_both) +
  theme_lyrics() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(x = NULL, y = NULL) +
  ggtitle("NRC Sentiment Song Analysis") +
  coord_flip()

Visualization 10 - Bigrams Per Decade

So far, we have looked at unigrams or single words. Now we will look at bigrams or ‘word pairs’ for more insights on the lyrics.

Note: Sometimes single words in lyrics can be misleading.

# Split every lyric line into overlapping two-word sequences.
eva_bigrams <- unnest_tokens(eva, bigram, lyrics, token = "ngrams", n = 2)

# Put the two words of each bigram into their own columns.
bigrams_separated <- separate(
  eva_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep a bigram only when neither word is a stop word or an undesirable word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% undesirable_words,
    !word2 %in% undesirable_words
  )

Filter out the cases where the two words are the same to avoid overlap.

# Seven most frequent bigrams per decade, skipping pairs where both words
# are the same. bigrams_filtered is derived from `eva` and already carries
# `decade` and the other identifying columns, so no join back to `eva` is
# needed before counting.
bigram_decade <- bigrams_filtered %>%
  filter(word1 != word2) %>%
  filter(decade != "NA") %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, decade, sort = TRUE) %>%
  group_by(decade) %>%
  slice(seq_len(7)) %>%
  ungroup() %>%
  arrange(decade, n) %>%
  mutate(row = row_number()) # unique bar position across both decades

Join by decade.

# Horizontal bars of bigram counts, one facet per decade. Bars are placed
# by `row`, and the axis breaks are relabelled with the bigram text.
bigram_decade %>%
  ggplot(aes(x = row, y = n, fill = decade)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~decade, scales = "free_y") +
  labs(x = NULL, y = NULL) +
  scale_x_continuous(
    breaks = bigram_decade$row, # the data frame is reused for the breaks
    labels = bigram_decade$bigram # and for the text shown at each break
  ) +
  theme_lyrics() +
  theme(panel.grid.major.x = element_blank()) +
  ggtitle("Bigrams Per Decade") +
  coord_flip()

Conclusions and Insights:

1 - Evanescence released about 100 songs in the 2000-2009 decade, and approximately 160 songs in the 2010-2021 decade.

2 - The most frequently used words in the lyrics of Evanescence are ‘love’,‘pain’, and ‘feel’. ‘love’ is the most used word in the 2000-2009 decade. However, ‘pain’ is the most used word in the 2010-2021 decade.

3 - The messages and themes in the lyrics of Evanescence are mostly negative.

4 - The most recurring (non-binary) categories for the lyrics of Evanescence are sadness, fear, and trust.

5 - Several correlations between sentiment categories can be observed in the lyrics of Evanescence throughout the years.

We have reached the end of this workshop. Thank you for participating!