tidy_taylor <- 
  swift |> 
  # Tokenizing the lyrics of each song
  unnest_tokens(output = word, input = lyrics)

tibble(tidy_taylor)
## # A tibble: 56,632 × 4
##    index album        song_name                 word
##    <int> <chr>        <chr>                     <chr>
##  1     0 Taylor Swift Mary's Song (Oh My My My) she
##  2     0 Taylor Swift Mary's Song (Oh My My My) said
##  3     0 Taylor Swift Mary's Song (Oh My My My) i
##  4     0 Taylor Swift Mary's Song (Oh My My My) was
##  5     0 Taylor Swift Mary's Song (Oh My My My) seven
##  6     0 Taylor Swift Mary's Song (Oh My My My) and
##  7     0 Taylor Swift Mary's Song (Oh My My My) you
##  8     0 Taylor Swift Mary's Song (Oh My My My) were
##  9     0 Taylor Swift Mary's Song (Oh My My My) nine
## 10     0 Taylor Swift Mary's Song (Oh My My My) i
## # ℹ 56,622 more rows
Next, we’ll remove any stop words (the, a, etc.) using anti_join() and the stop_words data set:
# Removing the stop words using anti_join() and the stop_words data set
swift_no_stop <- 
  anti_join(
    x = tidy_taylor,
    y = stop_words,
    by = "word"
  )

tibble(swift_no_stop)
## # A tibble: 15,911 × 4
##    index album        song_name                 word
##    <int> <chr>        <chr>                     <chr>
##  1     0 Taylor Swift Mary's Song (Oh My My My) looked
##  2     0 Taylor Swift Mary's Song (Oh My My My) stars
##  3     0 Taylor Swift Mary's Song (Oh My My My) shined
##  4     0 Taylor Swift Mary's Song (Oh My My My) sky
##  5     0 Taylor Swift Mary's Song (Oh My My My) pretty
##  6     0 Taylor Swift Mary's Song (Oh My My My) lights
##  7     0 Taylor Swift Mary's Song (Oh My My My) daddies
##  8     0 Taylor Swift Mary's Song (Oh My My My) joke
##  9     0 Taylor Swift Mary's Song (Oh My My My) growing
## 10     0 Taylor Swift Mary's Song (Oh My My My) falling
## # ℹ 15,901 more rows
Let’s see whether the top 100 words in Taylor Swift’s lyrics (without stop words) follow Zipf’s law. According to Zipf’s law, how frequently each word appears (relative to the most common word) should be about
\[\textrm{word frequency} \propto \frac{1}{\textrm{word rank}}\]
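To make the proportionality concrete: whatever the most common word’s count is, the word at rank r should appear about that count divided by r times. As a quick sketch in base R (using 272, which turns out below to be the count of her most common non-stop word):

# Zipf's law prediction for ranks 1 through 5 when the top word appears 272 times
round(272 / 1:5)
## [1] 272 136  91  68  54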
Let’s start by counting how many times each word occurs and then keeping the top 100:
# How often each of the top 100 words occurs
swift_top_words <- 
  swift_no_stop |> 
  # Counting the number of times each word appears
  count(word, sort = TRUE) |> 
  # Keeping the top 100 (count() already sorted them in descending order)
  slice(1:100) |> 
  # Adding the rank using row_number() and the inverse rank from Zipf's law
  mutate(
    rank = row_number(),
    inv_rank = 1 / rank
  )

tibble(swift_top_words)
## # A tibble: 100 × 4
##    word      n  rank inv_rank
##    <chr> <int> <int>    <dbl>
##  1 love    272     1    1
##  2 time    242     2    0.5
##  3 ooh     240     3    0.333
##  4 wanna   154     4    0.25
##  5 yeah    154     5    0.2
##  6 baby    143     6    0.167
##  7 gonna   138     7    0.143
##  8 ah      125     8    0.125
##  9 stay    119     9    0.111
## 10 bad     104    10    0.1
## # ℹ 90 more rows
# Looking at the correlation between word frequency and the inverse rank
cat(
  "Correlation between word frequency and the inverse rank:",
  with(swift_top_words, round(cor(n, inv_rank), 2))
)
## Correlation between word frequency and the inverse rank: 0.89
So there is a decently strong positive relationship between a word’s frequency and its inverse rank.
Next, let’s make a line graph to display how frequently each of her top 100 words occurs and what we’d expect if Zipf’s law were true:
# Plotting the results
ggplot(
  data = swift_top_words,
  mapping = aes(
    x = rank,
    y = n
  )
) + 
  geom_line(
    mapping = aes(color = "Actual Word Frequency")
  ) + 
  # Scaling the inverse rank by the most common word's count so the expected
  # Zipf's law curve starts at the same height as the actual curve
  geom_line(
    mapping = aes(
      y = inv_rank * max(n),
      color = "Expected Zipf's Law"
    )
  ) + 
  labs(
    x = "Word Rank",
    y = "Word Frequency",
    title = "Word Frequency for Taylor Swift Lyrics",
    subtitle = "Excluding 'Stop Words'",
    color = NULL
  ) + 
  theme_bw() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    # Placing the legend inside the plot panel (requires ggplot2 3.5.0 or later)
    legend.position = "inside",
    legend.position.inside = c(0.8, 0.3)
  ) + 
  scale_color_manual(
    values = c("Actual Word Frequency" = "steelblue",
               "Expected Zipf's Law" = "tomato")
  ) + 
  scale_y_continuous(
    breaks = seq(from = 0,
                 to = round(max(swift_top_words$n) / 250) * 250,
                 length.out = 6)
  ) + 
  scale_x_continuous(
    breaks = seq(from = 0, to = 100, by = 10),
    expand = c(0.025, 0)
  )
Overall, after the first 5 most common words, the expected frequency curve mimics the actual word frequency pretty closely, indicating that Zipf’s law models her lyrics (without stop words) pretty well!
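If we want a rough number for how closely, one option is to compare the actual counts with the Zipf’s law prediction directly and average the relative gap, skipping the first few ranks. A quick sketch, assuming the swift_top_words tibble from above (the expected column and the mean_relative_gap name are just for illustration; the scaling mirrors the inv_rank * max(n) used in the plot):

# Average relative gap between the actual counts and the Zipf's law prediction,
# ignoring the first 5 ranks where the two curves disagree the most
swift_top_words |> 
  mutate(expected = inv_rank * max(n)) |> 
  filter(rank > 5) |> 
  summarize(mean_relative_gap = mean(abs(n - expected) / n))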
Let’s look at a scatter plot comparing the actual and expected frequencies as well:
ggplot(
  data = swift_top_words,
  mapping = aes(
    x = inv_rank * max(n),
    y = n
  )
) + 
  geom_point() + 
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE
  ) + 
  ggrepel::geom_text_repel(
    # Just displaying the 10 most common words
    data = swift_top_words |> slice_max(n, n = 10),
    mapping = aes(label = word),
    nudge_y = .1
  ) + 
  annotate(
    geom = "text",
    x = 100,
    y = 100,
    label = "x and y-axes are in log10 scale",
    fontface = "bold"
  ) + 
  labs(
    x = "Expected Frequency by Zipf's Law",
    y = "Actual Frequency",
    title = "Does Zipf's Law Explain the Frequency of Taylor Swift Lyrics?",
    subtitle = "Lyrics Exclude Any 'Stop Words'"
  ) + 
  theme_classic() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) + 
  scale_x_log10() + 
  scale_y_log10()
After log-transforming both the x and y-axes, there is a strong, relatively straight-line relationship between the actual and expected frequencies!
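Since both axes are on the log10 scale, another way to check this is to regress log10(frequency) on log10(rank); if Zipf’s law holds, the slope should be close to -1. A minimal sketch, again assuming swift_top_words from above (zipf_fit is just an illustrative name):

# Fitting a line on the log-log scale; a slope near -1 is consistent with Zipf's law
zipf_fit <- lm(log10(n) ~ log10(rank), data = swift_top_words)
coef(zipf_fit)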
Finally, let’s create a word cloud for Swift’s most commonly used non-stop words:
wordcloud2(
  data = swift_top_words |> rename(freq = n),
  color = "random-light",
  backgroundColor = "black"
)