knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8)
In this markdown I want to show you how to compare Taylor Swift and Ed Sheeran lyrics using the tidytext library. I chose Taylor and Ed because they are my favorite musicians. tidytext is a great library for text mining alongside quanteda; I have been using quanteda, but now I am trying a new library. Both are great libraries for text mining.
library(tidytext) # text mining
library(tidyverse) # data manipulation
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.7 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(forcats) # for reordering factor
library(scales) # automatically determining breaks and labels for axes and legends
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
I found the dataset on Kaggle. You can visit this link if you want to play with the lyrics data: https://www.kaggle.com/datasets/deepshah16/song-lyrics-dataset/code
Let’s import the dataset.
# import dataset
df.taylor <- read.csv("lyrics/TaylorSwift.csv", encoding = "UTF-8",
stringsAsFactors = FALSE) # import taylor swift
df.ed <- read.csv("lyrics/EdSheeran.csv", encoding = "UTF-8",
stringsAsFactors = FALSE) # import ed sheeran
Merge the two datasets into a single data frame.
# merge
df <- rbind(df.taylor, df.ed) %>%
select(-X)
Because the release date column is formatted as character, I convert it to date format (YYYY-mm-dd).
# format Year as character and Date as a date-time
df$Year <- as.character(df$Year)
df$Date <- as.POSIXct(df$Date, format = "%Y-%m-%d")
I convert the data frame to a tibble (tidy format) and group it by artist.
# tidy
td <- df %>%
tibble() %>%
group_by(Artist)
td
## # A tibble: 775 x 6
## Artist Title Album Year Date Lyric
## <chr> <chr> <chr> <chr> <dttm> <chr>
## 1 Taylor Swift <U+200B>cardigan folklo~ 2020 2020-07-24 00:00:00 vint~
## 2 Taylor Swift <U+200B>exile folklo~ 2020 2020-07-24 00:00:00 just~
## 3 Taylor Swift Lover Lover 2019 2019-08-16 00:00:00 we c~
## 4 Taylor Swift <U+200B>the 1 folklo~ 2020 2020-07-24 00:00:00 i'm ~
## 5 Taylor Swift Look What You Made Me Do reputa~ 2017 2017-08-25 00:00:00 i do~
## 6 Taylor Swift <U+200B>betty folklo~ 2020 2020-07-24 00:00:00 bett~
## 7 Taylor Swift End Game reputa~ 2017 2017-11-10 00:00:00 tayl~
## 8 Taylor Swift ME! Lover 2019 2019-04-26 00:00:00 tayl~
## 9 Taylor Swift You Need To Calm Down Lover 2019 2019-06-14 00:00:00 you ~
## 10 Taylor Swift <U+200B>august folklo~ 2020 2020-07-24 00:00:00 salt~
## # ... with 765 more rows
Because some words are not important, I will get rid of them by creating a tibble containing the unneeded words and noise as custom stopwords.
# custom stopwords
custom.stopwords <- tibble(word = c("sheeran", "ed", "pre",
"05", "taylor", "swift", "ye",
"oh", "ey", "ohoh", "eh", "gaga",
"lady"))
Tokenizing the data is an important step before creating n-grams and bigrams. I also remove stopwords in this process.
# tokenizing
td.tokens <- td %>%
unnest_tokens(word, Lyric) %>%
anti_join(stop_words) %>%
anti_join(custom.stopwords)
## Joining, by = "word"
## Joining, by = "word"
td.tokens
## # A tibble: 81,656 x 2
## Artist word
## <chr> <chr>
## 1 Taylor Swift vintage
## 2 Taylor Swift tee
## 3 Taylor Swift brand
## 4 Taylor Swift phone
## 5 Taylor Swift heels
## 6 Taylor Swift cobblestones
## 7 Taylor Swift assume
## 8 Taylor Swift sequin
## 9 Taylor Swift smile
## 10 Taylor Swift black
## # ... with 81,646 more rows
Let’s see the top words. Note that td.tokens is still grouped by artist, so count() counts each word per artist and the same word can appear twice in the result.
# top words
df.topwords <- td.tokens %>%
count(word, sort = T) %>%
top_n(100) %>%
as.data.frame() %>%
select(-Artist)
## Selecting by n
head(df.topwords)
## # A tibble: 6 x 2
## word n
## <chr> <int>
## 1 love 1114
## 2 love 738
## 3 baby 609
## 4 time 586
## 5 baby 324
## 6 time 315
I chose the wordcloud2 library because it produces beautiful wordclouds.
# wordcloud
library(wordcloud2)
wordcloud2(df.topwords, size = 2, minRotation = -pi/6, maxRotation = -pi/6,
rotateRatio = 0)
Let’s visualize the top words per artist with ggplot2, using tidytext’s reorder_within() and scale_x_reordered() to order the bars within each facet.
# top words bar plot
td.tokens %>%
count(word, sort = T) %>%
group_by(Artist) %>%
top_n(10) %>%
ungroup %>%
mutate(Artist = as.factor(Artist),
word = reorder_within(word, n, Artist)) %>%
ggplot(aes(word, n, fill = Artist)) +
geom_col(show.legend = F) +
facet_wrap(~Artist, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(expand = c(0,0)) +
labs(y = "Number of Words Per Artist",
x = NULL,
title = "Most common words")
## Selecting by n
## TF-IDF N-Gram
TF-IDF (Term Frequency - Inverse Document Frequency) is a technique for evaluating how relevant and important a word is to a document within a collection of documents; here, each artist’s combined lyrics act as one document.
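Under the hood, bind_tf_idf() computes tf as a word’s share of all the words in its document and idf as the natural log of the total number of documents divided by the number of documents containing the word. Here is a minimal sketch with made-up counts (hypothetical, not from the lyrics data) that reproduces the calculation:
# minimal sketch with hypothetical counts, mimicking what bind_tf_idf() does
toy <- tibble(Artist = c("A", "A", "B"),
              word = c("love", "guitar", "love"),
              n = c(10, 2, 5))
toy %>%
  group_by(Artist) %>%
  mutate(tf = n / sum(n)) %>%                          # word's share within its document
  group_by(word) %>%
  mutate(idf = log(n_distinct(toy$Artist) / n())) %>%  # log(total docs / docs containing word)
  ungroup() %>%
  mutate(tf_idf = tf * idf)                            # "guitar" gets idf = log(2/1), "love" gets 0
With only two documents (one per artist), idf can only be log(2/1) = 0.693 for words unique to one artist or 0 for words both artists use, which is exactly what shows up below. Now let’s compute tf-idf on the lyric tokens.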
td.tokens.tfidf <- td.tokens %>%
count(Artist, word) %>%
bind_tf_idf(word, Artist, n)
td.tokens.tfidf[5000:5010,]
## # A tibble: 11 x 6
## Artist word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Ed Sheeran wasted 9 0.000250 0 0
## 2 Ed Sheeran wasting 23 0.000640 0 0
## 3 Ed Sheeran watch 49 0.00136 0 0
## 4 Ed Sheeran watched 23 0.000640 0 0
## 5 Ed Sheeran watches 1 0.0000278 0 0
## 6 Ed Sheeran watching 40 0.00111 0 0
## 7 Ed Sheeran water 79 0.00220 0 0
## 8 Ed Sheeran waters 1 0.0000278 0 0
## 9 Ed Sheeran waterside 1 0.0000278 0.693 0.0000193
## 10 Ed Sheeran wave 3 0.0000834 0 0
## 11 Ed Sheeran waves 1 0.0000278 0 0
Let’s visualize the TF-IDF
# visualize tf idf
td.tokens.tfidf %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(Artist) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = Artist)) +
geom_col(show.legend = F) +
labs(x = NULL, y = "TF-IDF") +
facet_wrap(~Artist, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
A bigram (or digram) is a sequence of two adjacent words, such as “i like” and “you did”.
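As a quick illustration with a made-up line (not from the dataset), unnest_tokens() with token = "ngrams" and n = 2 slides a two-word window over the text:
# toy example of bigram tokenization on a hypothetical sentence
tibble(text = "i like rainy nights") %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# returns three bigrams: "i like", "like rainy", "rainy nights"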
Let’s see the most frequent bigrams in the lyrics.
# bigrams
td.bigrams <- td %>%
unnest_tokens(bigram, Lyric, token = "ngrams", n = 2)
# bigrams count
td.bigrams %>%
count(bigram, sort = T) %>%
head()
## # A tibble: 6 x 3
## Artist bigram n
## <chr> <chr> <int>
## 1 Taylor Swift in the 655
## 2 Taylor Swift and i 640
## 3 Ed Sheeran and i 417
## 4 Ed Sheeran i don't 374
## 5 Ed Sheeran you need 372
## 6 Ed Sheeran on the 347
We need to separate the bigrams into two word columns so we can remove stopwords from either position.
# separate bigrams into two columns
td.bigrams.separated <- td.bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
td.bigrams.separated <- td.bigrams.separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word1 %in% custom.stopwords$word) %>%
filter(!word2 %in% custom.stopwords$word)
td.bigrams.separated %>%
head()
## # A tibble: 6 x 3
## Artist word1 word2
## <chr> <chr> <chr>
## 1 Taylor Swift vintage tee
## 2 Taylor Swift tee brand
## 3 Taylor Swift sequin smile
## 4 Taylor Swift smile black
## 5 Taylor Swift black lipstick
## 6 Taylor Swift lipstick sensual
Let’s count the frequency of the cleaned bigrams.
# bigrams count
td.bigrams.separated.count <- td.bigrams.separated %>%
count(word1, word2, sort = T)
td.bigrams.separated.count %>%
head()
## # A tibble: 6 x 4
## Artist word1 word2 n
## <chr> <chr> <chr> <int>
## 1 Ed Sheeran na na 93
## 2 Taylor Swift na na 86
## 3 Taylor Swift la la 71
## 4 Taylor Swift tim mcgraw 66
## 5 Taylor Swift bad blood 59
## 6 Taylor Swift stay stay 59
Unite the two words back into bigrams so we can visualize them.
# unite bigrams
td.bigrams.united <- td.bigrams.separated %>%
unite(bigram, word1, word2, sep = " ")
td.bigrams.united %>%
head()
## # A tibble: 6 x 2
## Artist bigram
## <chr> <chr>
## 1 Taylor Swift vintage tee
## 2 Taylor Swift tee brand
## 3 Taylor Swift sequin smile
## 4 Taylor Swift smile black
## 5 Taylor Swift black lipstick
## 6 Taylor Swift lipstick sensual
Let’s visualize it
# bi gram plot
td.bigrams.united %>%
count(bigram, sort = T) %>%
group_by(Artist) %>%
top_n(10) %>%
ungroup %>%
mutate(Artist = as.factor(Artist),
bigram = reorder_within(bigram, n, Artist)) %>%
ggplot(aes(bigram, n, fill = Artist)) +
geom_col(show.legend = F) +
facet_wrap(~Artist, scales = "free") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(expand = c(0,0)) +
labs(y = "Number of Words Bigram Per Artist",
x = NULL,
title = "Most common bigram")
## Selecting by n
We can also use bigrams for exploratory analysis, for example by looking only at bigrams whose second word is “love”.
# analyzing bigrams
td.bigrams.separated %>%
filter(word2 == "love") %>%
count(Artist, word1, sort = T) %>%
head()
## # A tibble: 6 x 3
## Artist word1 n
## <chr> <chr> <int>
## 1 Ed Sheeran found 38
## 2 Taylor Swift gonna 24
## 3 Ed Sheeran sells 17
## 4 Taylor Swift mad 17
## 5 Ed Sheeran love 13
## 6 Ed Sheeran night 12
Let’s look at the tf-idf of the bigrams.
# tf idf bigrams
td.bigrams.united.tfidf <- td.bigrams.united %>%
count(Artist, bigram) %>%
bind_tf_idf(bigram, Artist, n) %>%
arrange(desc(tf_idf))
td.bigrams.united.tfidf %>%
head()
## # A tibble: 6 x 6
## Artist bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Taylor Swift tim mcgraw 66 0.00487 0.693 0.00338
## 2 Taylor Swift bad blood 59 0.00436 0.693 0.00302
## 3 Taylor Swift stay stay 59 0.00436 0.693 0.00302
## 4 Ed Sheeran beautiful people 43 0.00400 0.693 0.00277
## 5 Ed Sheeran day discovering 43 0.00400 0.693 0.00277
## 6 Ed Sheeran galway girl 40 0.00372 0.693 0.00258
# visualize tf idf
td.bigrams.united.tfidf %>%
group_by(Artist) %>%
slice_max(tf_idf, n = 10) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = Artist)) +
geom_col(show.legend = F) +
facet_wrap(~Artist, scales = "free") +
labs(x = "tf-idf", y = NULL)
Now I want to compare the word frequencies between Taylor and Ed.
Let’s count the frequencies first.
# word freq comparison
td.tokens.freq <- td.tokens %>%
count(Artist, word, sort = T)
td.tokens.freq %>%
head()
## # A tibble: 6 x 3
## Artist word n
## <chr> <chr> <int>
## 1 Ed Sheeran love 1114
## 2 Taylor Swift love 738
## 3 Taylor Swift baby 609
## 4 Taylor Swift time 586
## 5 Ed Sheeran baby 324
## 6 Ed Sheeran time 315
# comparison
td.comparison <- td.tokens.freq %>%
add_count(Artist, wt = n, name = "total_word") %>%
mutate(proportion = n / total_word) %>%
select(-total_word, -n) %>%
pivot_wider(names_from = Artist, values_from = proportion,
values_fill = list(proportion = 0)) %>%
pivot_longer(3:3, names_to = "other", values_to = "proportion")
td.comparison %>%
head()
## # A tibble: 6 x 4
## word `Ed Sheeran` other proportion
## <chr> <dbl> <chr> <dbl>
## 1 love 0.0310 Taylor Swift 0.0161
## 2 baby 0.00901 Taylor Swift 0.0133
## 3 time 0.00876 Taylor Swift 0.0128
## 4 yeah 0.00848 Taylor Swift 0.00619
## 5 feel 0.00779 Taylor Swift 0.00464
## 6 night 0.00509 Taylor Swift 0.00613
Let’s visualize the comparison.
# plot
td.comparison %>%
filter(proportion > 1 / 1e5) %>%
ggplot(aes(proportion, `Ed Sheeran`)) +
geom_abline(color = "blue", lty = 2) +
geom_jitter(aes(color = abs(`Ed Sheeran` - proportion)),
alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = T, vjust = 1.5) +
scale_x_log10(labels = label_percent()) +
scale_y_log10(labels = label_percent()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~ other) +
guides(color = "none")
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3875 rows containing missing values (geom_point).
Both artists have similar frequencies for words like “love”, “night”, and “baby”; I think these words are just common in pop songs.
We can run a simple correlation test to check whether the two artists’ word frequencies are related.
# correlation test
cor.test(data = filter(td.comparison, other == "Taylor Swift"),
~ proportion + `Ed Sheeran`)
##
## Pearson's product-moment correlation
##
## data: proportion and Ed Sheeran
## t = 127.35, df = 9089, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7930283 0.8077979
## sample estimates:
## cor
## 0.8005346
Well, the word frequencies are strongly positively correlated (r ≈ 0.80).
Silge, J., & Robinson, D. (2016). tidytext: Text Mining and Analysis Using Tidy Data Principles in R. The Journal of Open Source Software, 1(3), 37. https://doi.org/10.21105/joss.00037
Wickham, H. (2021). forcats: Tools for Working with Categorical Variables (Factors). https://cran.r-project.org/package=forcats
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T., Miller, E., Bache, S., Müller, K., Ooms, J., Robinson, D., Seidel, D., Spinu, V., … Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686
Wickham, H., & Seidel, D. (2022). scales: Scale Functions for Visualization. https://cran.r-project.org/package=scales