knitr::opts_chunk$set(echo = TRUE, 
                      warning = F, 
                      message = F, 
                      fig.align = "center")

# Loading needed packages
pacman::p_load(tidyverse, tidytext)

# Reading in the taylor swift data set and removing rows with no lyrics
swift <- 
  taylor::taylor_album_songs |> 
  # Renaming some columns and keeping the important 3
  dplyr::select(album = album_name, track_name, lyrics) |> 
  # Changing the lyrics from a list per row to 1 line of song per row
  unnest(lyrics) |> 
  # Turning the lyric into sentences
  unnest_tokens(word, lyric, token = "words") |> 
  # Ordering the levels of album to the order they appear in the data
  # (chronologically)
  mutate(
    #album = str_remove(album, "\\(Taylor's Version\\)"),
    album = as_factor(album),
  )
tibble(swift)
## # A tibble: 88,683 × 6
##    album        track_name  line element element_artist word  
##    <fct>        <chr>      <int> <chr>   <chr>          <chr> 
##  1 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   he    
##  2 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   said  
##  3 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   the   
##  4 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   way   
##  5 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   my    
##  6 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   blue  
##  7 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   eyes  
##  8 Taylor Swift Tim McGraw     1 Verse 1 Taylor Swift   shined
##  9 Taylor Swift Tim McGraw     2 Verse 1 Taylor Swift   put   
## 10 Taylor Swift Tim McGraw     2 Verse 1 Taylor Swift   those 
## # ℹ 88,673 more rows

Word Fequency

term frequency (tf): How frequently a word occurs in a document

Inverse Document Frequency (idf): decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents.

This can be combined with term frequency to calculate a term’s tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used


The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites

\[tf(\textrm{term}) = \frac{n_{\textrm{term in corpus}}}{n_{\textrm{tokens in corpus}}}\]

\[idf(\textrm{term}) = \ln\left(\frac{n_{\textrm{corpuses}}}{n_{\textrm{term in corpuses}}}\right)\]

We’ll look at how frequently each term occurs in Taylor Swift’s eleven main release studio albums to try to find the most important words using tf-idf

We’ll start by finding the total number of words per album and how often each unique word occurs per album:

swift_word_count <- 
  swift |> 
  # How often each word occurred per album
  summarize(
    .by = c(album, word),
    count = n()
  ) |> 
  # adding the total number of words per album column
  mutate(
    .by = album,
    total_words = sum(count)
  ) 

set.seed(1234)
slice_sample(swift_word_count, n = 10) |> arrange(album) |> gt::gt()
album word count total_words
Taylor Swift she'll 1 4305
Fearless (Taylor's Version) garden 1 8893
Red (Taylor's Version) room 13 11400
Lover joke 1 6844
folklore do 14 5148
folklore angry 4 5148
folklore seventeen 3 5148
folklore leaving 3 5148
evermore receipt 1 6023
THE TORTURED POETS DEPARTMENT cause 42 10801

Next, we’ll apply bind_tf_idf() from tidytext to find idf and tf_idf

swift_tf_idf <- 
  bind_tf_idf(
    tbl = swift_word_count,
    term = word, 
    document = album, 
    n = count
  )

# Looking at the most important words per album
swift_tf_idf |> 
  slice_max(
    by = album,
    order_by = tf_idf,
    n = 2,
    with_ties = F
  )
## # A tibble: 22 × 7
##    album                        word     count total_words      tf   idf  tf_idf
##    <fct>                        <chr>    <int>       <int>   <dbl> <dbl>   <dbl>
##  1 Taylor Swift                 tim          6        4305 0.00139 2.40  0.00334
##  2 Taylor Swift                 mcgraw       6        4305 0.00139 2.40  0.00334
##  3 Fearless (Taylor's Version)  bye         26        8893 0.00292 2.40  0.00701
##  4 Fearless (Taylor's Version)  fairyta…    19        8893 0.00214 2.40  0.00512
##  5 Speak Now (Taylor's Version) emma         9        8848 0.00102 2.40  0.00244
##  6 Speak Now (Taylor's Version) timeless     9        8848 0.00102 2.40  0.00244
##  7 Red (Taylor's Version)       trouble     34       11400 0.00298 2.40  0.00715
##  8 Red (Taylor's Version)       red        115       11400 0.0101  0.606 0.00611
##  9 1989 (Taylor's Version)      woods       38       10106 0.00376 2.40  0.00902
## 10 1989 (Taylor's Version)      shake       70       10106 0.00693 0.788 0.00546
## # ℹ 12 more rows

We can also calculate the tf_idf ourselves:

# Getting the number of books in the data
n_albums <- length(unique(swift$album))
album_tf_idf <- 
  swift_word_count |> 
  # Counting how many albums each word occurs in
  summarize(
    .by = word,
    occurs_in = n_distinct(album)
  ) |> 
  # Arranging from fewest albums to most albums
  arrange(occurs_in) |> 
  # Calculating the idf for each word
  transmute(
    word,
    idf = log(n_albums/occurs_in)
  ) |> 
  # Adding the idf back to swift_word_count
  right_join(
    y = swift_word_count,
    by = "word"
  ) |> 
  # Calculating the term frequency and tf_idf
  mutate(
    # tf = count/word total per album
    tf = count/total_words,
    tf_idf = tf * idf
  ) |> 
  # Reordering the columns better
  dplyr::select(album, total_words, word, count, tf, idf, tf_idf) |> 
  arrange(tf_idf)

album_tf_idf |> 
  slice_max(
    by = album,
    order_by = tf_idf,
    n = 2,
    with_ties = F
  )
## # A tibble: 22 × 7
##    album                        total_words word     count      tf   idf  tf_idf
##    <fct>                              <int> <chr>    <int>   <dbl> <dbl>   <dbl>
##  1 Taylor Swift                        4305 tim          6 0.00139 2.40  0.00334
##  2 Taylor Swift                        4305 mcgraw       6 0.00139 2.40  0.00334
##  3 Fearless (Taylor's Version)         8893 bye         26 0.00292 2.40  0.00701
##  4 Fearless (Taylor's Version)         8893 fairyta…    19 0.00214 2.40  0.00512
##  5 Speak Now (Taylor's Version)        8848 emma         9 0.00102 2.40  0.00244
##  6 Speak Now (Taylor's Version)        8848 timeless     9 0.00102 2.40  0.00244
##  7 Red (Taylor's Version)             11400 trouble     34 0.00298 2.40  0.00715
##  8 Red (Taylor's Version)             11400 red        115 0.0101  0.606 0.00611
##  9 1989 (Taylor's Version)            10106 woods       38 0.00376 2.40  0.00902
## 10 1989 (Taylor's Version)            10106 shake       70 0.00693 0.788 0.00546
## # ℹ 12 more rows

Same result as using the function! So why do it “by hand?” Mainly to show what idf and tf_idf are!

Visualizing the important words by album

We’ll visualize the ten most important words per album by with the plots below:

gg_swift_words <- 
  swift_tf_idf |> 
  slice_max(
    by = album,
    order_by = tf_idf,
    n = 10,
    with_ties = F
  ) |> 
  ggplot(
    mapping = aes(
      x = tf_idf,
      # reorder_within is in the tidytext package. 
      # Make sure to use correct scale function later!
      y = reorder_within(word, tf_idf, album),
      fill = album
    )
  ) + 
  geom_col(show.legend = F)  +
  facet_wrap(
    facets = vars(album),
    ncol = 2,
    scales = "free_y"
  ) + 
  labs(
    x = "tf-idf",
    y = NULL,
    title = "Important Words by Taylor Swift Album"
  ) + 
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  # Need to use scale_y_reordered to get it to display the correct values
  # on the y-axis
  scale_y_reordered() + 
  taylor::scale_fill_albums() + 
  coord_cartesian(expand = F)

gg_swift_words +
  geom_text(
    mapping = aes(label = count),
    #contrast = T,
    color = "white",
    nudge_x = -0.0005
  )

Some of the more important words are “filler” words occasionally used in songs (like di, ho, and ra). Let’s recreate the graph with the filler words removed

# Vector of filler words
filler_words <- c(
  # taylor swift and fearless albums
  "na", "mr", "mm", "la",
  # Speak now and red
  "la", "ho", "oh", "hoo", 
  # 1989 and reputation
  "di", "da", "ra", "whoa", "ha", 
  # Lover and folklore
  "eeh", "e" 
  # evermore, midnights, and TTPD
  
  )

# Taking the old graph and replacing the data set in it
(
  gg_swift_words + 
    geom_text(
      mapping = aes(label = count),
      #contrast = T,
      color = "white",
      nudge_x = -0.00045
    )
)  %+%
  (
    swift_tf_idf |> 
      # Removing the filler words
      filter(
        !word %in% filler_words
      ) |> 
      slice_max(
        by = album,
        order_by = tf_idf,
        n = 10,
        with_ties = F
      )
  )

I’m not a Taylor Swift fan, so if these words are helpful for understanding what each album is about, a fan would be more helpful there.

One noticable word is red, which occurs 115 times in the album of the same name. That alone isn’t surprising. What does stand out is that the tf_idf value isn’t that large when compared to some of the other words, like daylight and florida, which occurred 40 and 20 times, respectively. So why does red have a very high occurrence (tf is about 0.01) and low tf-idf value?

It occurs in six of the eleven albums, giving it a low idf value of about 0.6, so the tf-idf value is 0.006. It’s high enough to be included, but not the largest by any means (daylight has the largest tf-idf of 0.014 since it only occurs on a single album).