Explore Data
library(tidyverse)
lyrics <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-14/lyrics.csv")
lyrics %>% distinct(album_name)
## # A tibble: 3 × 1
## album_name
## <chr>
## 1 Spice
## 2 Spiceworld
## 3 Forever
lyrics %>% distinct(album_name, song_name)
## # A tibble: 31 × 2
## album_name song_name
## <chr> <chr>
## 1 Spice "Wannabe"
## 2 Spice "Say You\x92ll Be There"
## 3 Spice "2 Become 1"
## 4 Spice "Love Thing"
## 5 Spice "Last Time Lover"
## 6 Spice "Mama"
## 7 Spice "Who Do You Think You Are"
## 8 Spice "Something Kinda Funny"
## 9 Spice "Naked"
## 10 Spice "If U Can\x92t Dance"
## # ℹ 21 more rows
library(tidytext)
tidy_lyrics <-
lyrics %>%
mutate(song_name = str_replace_all(song_name, "\x92", "'")) %>%
unnest_tokens(word, line) %>%
anti_join(get_stopwords())
tidy_lyrics %>% count(word, sort = TRUE)
## # A tibble: 979 × 2
## word n
## <chr> <int>
## 1 get 153
## 2 love 137
## 3 know 124
## 4 time 106
## 5 wanna 102
## 6 never 101
## 7 oh 88
## 8 yeah 88
## 9 la 85
## 10 got 82
## # ℹ 969 more rows
tidy_lyrics %>% count(song_name, word, sort = TRUE)
## # A tibble: 2,206 × 3
## song_name word n
## <chr> <chr> <int>
## 1 Saturday Night Divas get 91
## 2 Spice Up Your Life la 64
## 3 If U Can't Dance dance 60
## 4 Holler holler 48
## 5 Never Give Up on the Good Times never 47
## 6 Move Over generation 41
## 7 Saturday Night Divas deeper 41
## 8 Move Over yeah 39
## 9 Something Kinda Funny got 39
## 10 Never Give Up on the Good Times give 38
## # ℹ 2,196 more rows
Train topic model
lyrics_sparse <-
tidy_lyrics %>%
count(song_name, word) %>%
cast_sparse(song_name, word, n)
library(stm)
topic_model <- stm(lyrics_sparse, K = 4)
## Beginning Spectral Initialization
## Calculating the gram matrix...
## Finding anchor words...
## ....
## Recovering initialization...
## .........
## Initialization complete.
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -5.709)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -5.338, relative change = 6.486e-02)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -5.237, relative change = 1.893e-02)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -5.214, relative change = 4.519e-03)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 5 (approx. per word bound = -5.198, relative change = 2.955e-03)
## Topic 1: get, wanna, time, right, night
## Topic 2: dance, yeah, know, love, generation
## Topic 3: got, love, holler, make, wanna
## Topic 4: la, never, give, love, time
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 6 (approx. per word bound = -5.187, relative change = 2.100e-03)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -5.184, relative change = 6.849e-04)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -5.183, relative change = 1.905e-04)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 9 (approx. per word bound = -5.183, relative change = 4.775e-05)
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 10 (approx. per word bound = -5.182, relative change = 1.627e-05)
## Topic 1: get, wanna, time, right, night
## Topic 2: dance, yeah, love, know, generation
## Topic 3: got, love, holler, make, oh
## Topic 4: la, never, give, time, know
## ...............................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Model Converged
Explore topic model results
song_topics <- tidy(topic_model, matrix = "beta")
song_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(topic = paste("Topic", topic)) %>%
ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(topic), scales = "free_y") +
scale_y_reordered() +
labs(x = expression(beta), y = NULL)

song_topics <- tidy(topic_model, matrix = "gamma",
document_names = rownames(lyrics_sparse))
song_topics %>%
mutate(document = fct_reorder(document, gamma),
topic = factor(topic)) %>%
ggplot(aes(gamma, topic, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(document), ncol = 4) +
labs(x = expression(gamma), y = "Topic")

effects <-
estimateEffect(
1:4 ~ album_name,
topic_model,
tidy_lyrics %>% distinct(song_name, album_name) %>% arrange (song_name)
)
summary(effects)
##
## Call:
## estimateEffect(formula = 1:4 ~ album_name, stmobj = topic_model,
## metadata = tidy_lyrics %>% distinct(song_name, album_name) %>%
## arrange(song_name))
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20909 0.13360 1.565 0.129
## album_nameSpice 0.04241 0.19227 0.221 0.827
## album_nameSpiceworld 0.09401 0.18854 0.499 0.622
##
##
## Topic 2:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1693 0.1334 1.269 0.215
## album_nameSpice 0.1417 0.1928 0.735 0.469
## album_nameSpiceworld 0.1199 0.1888 0.635 0.531
##
##
## Topic 3:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2195 0.1181 1.858 0.0737 .
## album_nameSpice 0.1000 0.1628 0.614 0.5440
## album_nameSpiceworld -0.1997 0.1594 -1.252 0.2208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Topic 4:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.40273 0.14412 2.794 0.00928 **
## album_nameSpice -0.28209 0.19975 -1.412 0.16891
## album_nameSpiceworld -0.01345 0.20230 -0.066 0.94748
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1