Tokenizing the data

# Tokenizing the lyrics column of `swift`: the result has one row per word,
# stored in a new `word` column
tidy_taylor <- unnest_tokens(tbl = swift, output = word, input = lyrics)

# Previewing the tokenized lyrics as a tibble
as_tibble(tidy_taylor)
## # A tibble: 56,632 × 4
##    index album        song_name                 word 
##    <int> <chr>        <chr>                     <chr>
##  1     0 Taylor Swift Mary's Song (Oh My My My) she  
##  2     0 Taylor Swift Mary's Song (Oh My My My) said 
##  3     0 Taylor Swift Mary's Song (Oh My My My) i    
##  4     0 Taylor Swift Mary's Song (Oh My My My) was  
##  5     0 Taylor Swift Mary's Song (Oh My My My) seven
##  6     0 Taylor Swift Mary's Song (Oh My My My) and  
##  7     0 Taylor Swift Mary's Song (Oh My My My) you  
##  8     0 Taylor Swift Mary's Song (Oh My My My) were 
##  9     0 Taylor Swift Mary's Song (Oh My My My) nine 
## 10     0 Taylor Swift Mary's Song (Oh My My My) i    
## # ℹ 56,622 more rows

Next, we’ll remove any stop words (“the”, “a”, etc.).

# Dropping the stop words: the anti-join keeps only the rows of tidy_taylor
# whose `word` has no match in the stop_words data set
swift_no_stop <-
  tidy_taylor |>
  anti_join(stop_words, by = "word")

# Previewing the lyrics with the stop words removed
as_tibble(swift_no_stop)
## # A tibble: 15,911 × 4
##    index album        song_name                 word   
##    <int> <chr>        <chr>                     <chr>  
##  1     0 Taylor Swift Mary's Song (Oh My My My) looked 
##  2     0 Taylor Swift Mary's Song (Oh My My My) stars  
##  3     0 Taylor Swift Mary's Song (Oh My My My) shined 
##  4     0 Taylor Swift Mary's Song (Oh My My My) sky    
##  5     0 Taylor Swift Mary's Song (Oh My My My) pretty 
##  6     0 Taylor Swift Mary's Song (Oh My My My) lights 
##  7     0 Taylor Swift Mary's Song (Oh My My My) daddies
##  8     0 Taylor Swift Mary's Song (Oh My My My) joke   
##  9     0 Taylor Swift Mary's Song (Oh My My My) growing
## 10     0 Taylor Swift Mary's Song (Oh My My My) falling
## # ℹ 15,901 more rows

Let’s see whether the top 100 words in Taylor Swift’s lyrics (without stop words) follow Zipf’s law. According to Zipf’s law, how frequently each word appears (relative to the most common word) should be approximately inversely proportional to its rank:

\[\textrm{word frequency} \propto \frac{1}{\textrm{word rank}}\]

Let’s start by counting how many times each word occurs and then keeping the top 100:

# How often each of the top 100 (non-stop) words occurs
swift_top_words <-
  swift_no_stop |>
  # Counting the number of times each word appears, most frequent first
  count(word, sort = TRUE) |>
  # Keeping the 100 most frequent words (rows are already in descending order)
  slice_head(n = 100) |>
  # Adding the rank and its inverse. row_number() numbers rows in their
  # current order, so words tied on count still get distinct ranks
  mutate(
    rank = row_number(),
    inv_rank = 1 / rank
  )

# Previewing the ranked word counts as a tibble
as_tibble(swift_top_words)
## # A tibble: 100 × 4
##    word      n  rank inv_rank
##    <chr> <int> <int>    <dbl>
##  1 love    272     1    1    
##  2 time    242     2    0.5  
##  3 ooh     240     3    0.333
##  4 wanna   154     4    0.25 
##  5 yeah    154     5    0.2  
##  6 baby    143     6    0.167
##  7 gonna   138     7    0.143
##  8 ah      125     8    0.125
##  9 stay    119     9    0.111
## 10 bad     104    10    0.1  
## # ℹ 90 more rows
# Printing the Pearson correlation (rounded to 2 decimals) between each
# word's count and its inverse rank
cat(
  "Correlation between word frequency and the inverse rank:",
  round(cor(swift_top_words$n, swift_top_words$inv_rank), digits = 2)
)
## Correlation between word frequency and the inverse rank: 0.89

So there is a decently strong relationship between word frequency and the inverse rank of the word.

Next, let’s make a line graph to display how frequently each of her top 100 words occurs and what we’d expect if Zipf’s law held exactly:

# Line chart of the actual count of each top-100 word (by rank) next to the
# count expected under Zipf's law: the top word's count times 1 / rank
swift_top_words |>
  ggplot(aes(x = rank, y = n)) +
  # Observed counts
  geom_line(aes(color = "Actual Word Frequency")) +
  # Expected counts, scaled so both lines start at the same value
  geom_line(
    aes(
      y = max(n) * inv_rank,
      color = "Expected Zipf's Law"
    )
  ) +
  scale_x_continuous(
    breaks = seq(0, 100, by = 10),
    expand = c(0.025, 0)
  ) +
  scale_y_continuous(
    # Six evenly spaced breaks from 0 up to the largest count rounded to the
    # nearest multiple of 250
    breaks = seq(
      from = 0,
      to = 250 * round(max(swift_top_words$n) / 250),
      length.out = 6
    )
  ) +
  scale_color_manual(
    values = c(
      "Actual Word Frequency" = "steelblue",
      "Expected Zipf's Law" = "tomato"
    )
  ) +
  labs(
    title = "Word Frequency for Taylor Swift Lyrics",
    subtitle = "Excluding 'Stop Words'",
    x = "Word Rank",
    y = "Word Frequency",
    color = NULL
  ) +
  theme_bw() +
  # Centering the titles and placing the legend inside the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "inside",
    legend.position.inside = c(0.8, 0.3)
  )

Overall, after the first 5 most common words, the expected-frequency curve tracks the actual word frequency pretty closely, indicating that Zipf’s law models her lyrics (without stop words) pretty well!

Let’s look at a scatter plot as well:

# Scatter plot of the actual count of each top-100 word against the count
# expected under Zipf's law (top word's count times 1 / rank), on log10 axes
ggplot(
  data = swift_top_words,
  mapping = aes(
    x = inv_rank * max(n),
    y = n
  )
) +
  geom_point() +
  # Straight-line fit without a confidence band
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE
  ) +
  ggrepel::geom_text_repel(
    # Just displaying the 10 most common words; with_ties = FALSE keeps it to
    # exactly 10 labels even when several words tie on count at the cut-off
    data = swift_top_words |> slice_max(n, n = 10, with_ties = FALSE),
    mapping = aes(label = word),
    nudge_y = 0.1
  ) +
  # Reminder on the plot that both axes are log-scaled
  annotate(
    geom = "text",
    x = 100,
    y = 100,
    label = "x and y-axes are in log10 scale",
    fontface = "bold"
  ) +
  labs(
    x = "Expected Frequency by Zipf's Law",
    y = "Actual Frequency",
    title = "Does Zipf's Law Explain the Frequency of Taylor Swift Lyrics?",
    subtitle = "Lyrics Exclude Any 'Stop Words'"
  ) +
  theme_classic() +
  # Centering the title and subtitle
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_x_log10() +
  scale_y_log10()

After log transforming both the x and y-axes, there is a strong, relatively straight line relationship between the actual and expected frequencies!

Word Cloud

Finally, let’s create a word cloud for Swift’s most commonly used non-stop words:

# Word cloud of the top 100 non-stop words on a black background
wordcloud2(
  # Passing only the word and its count (renamed to freq); the rank columns
  # are not needed for the cloud
  data = swift_top_words |> select(word, freq = n),
  color = "random-light",
  # No trailing comma after the last argument: it would pass an extra,
  # empty argument to wordcloud2()
  backgroundColor = "black"
)