knitr::opts_chunk$set(echo = TRUE,
warning = F,
message = F,
fig.align = "center")
# Loading needed packages
pacman::p_load(tidyverse, tidytext)
# Reading in the taylor swift data set and removing rows with no lyrics
swift <-
taylor::taylor_album_songs |>
# Renaming some columns and keeping the important 3
dplyr::select(album = album_name, track_name, lyrics) |>
# Changing the lyrics from a list per row to 1 line of song per row
unnest(lyrics) |>
# Turning the lyric into sentences
unnest_tokens(word, lyric, token = "words") |>
# Ordering the levels of album to the order they appear in the data
# (chronologically)
mutate(
#album = str_remove(album, "\\(Taylor's Version\\)"),
album = as_factor(album),
)
tibble(swift)
## # A tibble: 88,683 × 6
## album track_name line element element_artist word
## <fct> <chr> <int> <chr> <chr> <chr>
## 1 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift he
## 2 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift said
## 3 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift the
## 4 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift way
## 5 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift my
## 6 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift blue
## 7 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift eyes
## 8 Taylor Swift Tim McGraw 1 Verse 1 Taylor Swift shined
## 9 Taylor Swift Tim McGraw 2 Verse 1 Taylor Swift put
## 10 Taylor Swift Tim McGraw 2 Verse 1 Taylor Swift those
## # ℹ 88,673 more rows
term frequency (tf): How frequently a word occurs in a document
Inverse Document Frequency (idf): decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents.
This can be combined with term frequency to calculate a term’s tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used
The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites
\[tf(\textrm{term}) = \frac{n_{\textrm{term in corpus}}}{n_{\textrm{tokens in corpus}}}\]
\[idf(\textrm{term}) = \ln\left(\frac{n_{\textrm{corpuses}}}{n_{\textrm{term in corpuses}}}\right)\]
We’ll look at how frequently each term occurs in Taylor Swift’s eleven main release studio albums to try to find the most important words using tf-idf
We’ll start by finding the total number of words per album and how often each unique word occurs per album:
swift_word_count <-
swift |>
# How often each word occurred per album
summarize(
.by = c(album, word),
count = n()
) |>
# adding the total number of words per album column
mutate(
.by = album,
total_words = sum(count)
)
set.seed(1234)
slice_sample(swift_word_count, n = 10) |> arrange(album) |> gt::gt()
album | word | count | total_words |
---|---|---|---|
Taylor Swift | she'll | 1 | 4305 |
Fearless (Taylor's Version) | garden | 1 | 8893 |
Red (Taylor's Version) | room | 13 | 11400 |
Lover | joke | 1 | 6844 |
folklore | do | 14 | 5148 |
folklore | angry | 4 | 5148 |
folklore | seventeen | 3 | 5148 |
folklore | leaving | 3 | 5148 |
evermore | receipt | 1 | 6023 |
THE TORTURED POETS DEPARTMENT | cause | 42 | 10801 |
Next, we’ll apply bind_tf_idf()
from
tidytext
to find idf
and
tf_idf
swift_tf_idf <-
bind_tf_idf(
tbl = swift_word_count,
term = word,
document = album,
n = count
)
# Looking at the most important words per album
swift_tf_idf |>
slice_max(
by = album,
order_by = tf_idf,
n = 2,
with_ties = F
)
## # A tibble: 22 × 7
## album word count total_words tf idf tf_idf
## <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Taylor Swift tim 6 4305 0.00139 2.40 0.00334
## 2 Taylor Swift mcgraw 6 4305 0.00139 2.40 0.00334
## 3 Fearless (Taylor's Version) bye 26 8893 0.00292 2.40 0.00701
## 4 Fearless (Taylor's Version) fairyta… 19 8893 0.00214 2.40 0.00512
## 5 Speak Now (Taylor's Version) emma 9 8848 0.00102 2.40 0.00244
## 6 Speak Now (Taylor's Version) timeless 9 8848 0.00102 2.40 0.00244
## 7 Red (Taylor's Version) trouble 34 11400 0.00298 2.40 0.00715
## 8 Red (Taylor's Version) red 115 11400 0.0101 0.606 0.00611
## 9 1989 (Taylor's Version) woods 38 10106 0.00376 2.40 0.00902
## 10 1989 (Taylor's Version) shake 70 10106 0.00693 0.788 0.00546
## # ℹ 12 more rows
We can also calculate the tf_idf ourselves:
# Getting the number of books in the data
n_albums <- length(unique(swift$album))
album_tf_idf <-
swift_word_count |>
# Counting how many albums each word occurs in
summarize(
.by = word,
occurs_in = n_distinct(album)
) |>
# Arranging from fewest albums to most albums
arrange(occurs_in) |>
# Calculating the idf for each word
transmute(
word,
idf = log(n_albums/occurs_in)
) |>
# Adding the idf back to swift_word_count
right_join(
y = swift_word_count,
by = "word"
) |>
# Calculating the term frequency and tf_idf
mutate(
# tf = count/word total per album
tf = count/total_words,
tf_idf = tf * idf
) |>
# Reordering the columns better
dplyr::select(album, total_words, word, count, tf, idf, tf_idf) |>
arrange(tf_idf)
album_tf_idf |>
slice_max(
by = album,
order_by = tf_idf,
n = 2,
with_ties = F
)
## # A tibble: 22 × 7
## album total_words word count tf idf tf_idf
## <fct> <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 Taylor Swift 4305 tim 6 0.00139 2.40 0.00334
## 2 Taylor Swift 4305 mcgraw 6 0.00139 2.40 0.00334
## 3 Fearless (Taylor's Version) 8893 bye 26 0.00292 2.40 0.00701
## 4 Fearless (Taylor's Version) 8893 fairyta… 19 0.00214 2.40 0.00512
## 5 Speak Now (Taylor's Version) 8848 emma 9 0.00102 2.40 0.00244
## 6 Speak Now (Taylor's Version) 8848 timeless 9 0.00102 2.40 0.00244
## 7 Red (Taylor's Version) 11400 trouble 34 0.00298 2.40 0.00715
## 8 Red (Taylor's Version) 11400 red 115 0.0101 0.606 0.00611
## 9 1989 (Taylor's Version) 10106 woods 38 0.00376 2.40 0.00902
## 10 1989 (Taylor's Version) 10106 shake 70 0.00693 0.788 0.00546
## # ℹ 12 more rows
Same result as using the function! So why do it “by hand?” Mainly to
show what idf
and tf_idf
are!
We’ll visualize the ten most important words per album by with the plots below:
gg_swift_words <-
swift_tf_idf |>
slice_max(
by = album,
order_by = tf_idf,
n = 10,
with_ties = F
) |>
ggplot(
mapping = aes(
x = tf_idf,
# reorder_within is in the tidytext package.
# Make sure to use correct scale function later!
y = reorder_within(word, tf_idf, album),
fill = album
)
) +
geom_col(show.legend = F) +
facet_wrap(
facets = vars(album),
ncol = 2,
scales = "free_y"
) +
labs(
x = "tf-idf",
y = NULL,
title = "Important Words by Taylor Swift Album"
) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
# Need to use scale_y_reordered to get it to display the correct values
# on the y-axis
scale_y_reordered() +
taylor::scale_fill_albums() +
coord_cartesian(expand = F)
gg_swift_words +
geom_text(
mapping = aes(label = count),
#contrast = T,
color = "white",
nudge_x = -0.0005
)
Some of the more important words are “filler” words occasionally used in songs (like di, ho, and ra). Let’s recreate the graph with the filler words removed
# Vector of filler words
filler_words <- c(
# taylor swift and fearless albums
"na", "mr", "mm", "la",
# Speak now and red
"la", "ho", "oh", "hoo",
# 1989 and reputation
"di", "da", "ra", "whoa", "ha",
# Lover and folklore
"eeh", "e"
# evermore, midnights, and TTPD
)
# Taking the old graph and replacing the data set in it
(
gg_swift_words +
geom_text(
mapping = aes(label = count),
#contrast = T,
color = "white",
nudge_x = -0.00045
)
) %+%
(
swift_tf_idf |>
# Removing the filler words
filter(
!word %in% filler_words
) |>
slice_max(
by = album,
order_by = tf_idf,
n = 10,
with_ties = F
)
)
I’m not a Taylor Swift fan, so if these words are helpful for understanding what each album is about, a fan would be more helpful there.
One noticable word is red, which occurs 115 times in the
album of the same name. That alone isn’t surprising. What does stand out
is that the tf_idf
value isn’t that large when compared to
some of the other words, like daylight and florida,
which occurred 40 and 20 times, respectively. So why does red
have a very high occurrence (tf is about 0.01) and low tf-idf value?
It occurs in six of the eleven albums, giving it a low idf value of about 0.6, so the tf-idf value is 0.006. It’s high enough to be included, but not the largest by any means (daylight has the largest tf-idf of 0.014 since it only occurs on a single album).