Explore Data

library(tidyverse)

lyrics <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-14/lyrics.csv")
lyrics %>% distinct(album_name)
## # A tibble: 3 × 1
##   album_name
##   <chr>     
## 1 Spice     
## 2 Spiceworld
## 3 Forever
lyrics %>% distinct(album_name, song_name)
## # A tibble: 31 × 2
##    album_name song_name                 
##    <chr>      <chr>                     
##  1 Spice      "Wannabe"                 
##  2 Spice      "Say You\x92ll Be There"  
##  3 Spice      "2 Become 1"              
##  4 Spice      "Love Thing"              
##  5 Spice      "Last Time Lover"         
##  6 Spice      "Mama"                    
##  7 Spice      "Who Do You Think You Are"
##  8 Spice      "Something Kinda Funny"   
##  9 Spice      "Naked"                   
## 10 Spice      "If U Can\x92t Dance"     
## # ℹ 21 more rows
library(tidytext)

tidy_lyrics <- 
    lyrics %>% 
    mutate(song_name = str_replace_all(song_name, "\x92", "'")) %>%
    unnest_tokens(word, line) %>%
    anti_join(get_stopwords())
tidy_lyrics %>% count(word, sort = TRUE)
## # A tibble: 979 × 2
##    word      n
##    <chr> <int>
##  1 get     153
##  2 love    137
##  3 know    124
##  4 time    106
##  5 wanna   102
##  6 never   101
##  7 oh       88
##  8 yeah     88
##  9 la       85
## 10 got      82
## # ℹ 969 more rows
tidy_lyrics %>% count(song_name, word, sort = TRUE)
## # A tibble: 2,206 × 3
##    song_name                       word           n
##    <chr>                           <chr>      <int>
##  1 Saturday Night Divas            get           91
##  2 Spice Up Your Life              la            64
##  3 If U Can't Dance                dance         60
##  4 Holler                          holler        48
##  5 Never Give Up on the Good Times never         47
##  6 Move Over                       generation    41
##  7 Saturday Night Divas            deeper        41
##  8 Move Over                       yeah          39
##  9 Something Kinda Funny           got           39
## 10 Never Give Up on the Good Times give          38
## # ℹ 2,196 more rows

Train topic model

lyrics_sparse <-
    tidy_lyrics %>%
    count(song_name, word) %>%
    cast_sparse(song_name, word, n)
library(stm)

topic_model <- stm(lyrics_sparse, K = 4)
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ....
##   Recovering initialization...
##      .........
## Initialization complete.
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -5.709) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -5.338, relative change = 6.486e-02) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -5.237, relative change = 1.893e-02) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -5.214, relative change = 4.519e-03) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -5.198, relative change = 2.955e-03) 
## Topic 1: get, wanna, time, right, night 
##  Topic 2: dance, yeah, know, love, generation 
##  Topic 3: got, love, holler, make, wanna 
##  Topic 4: la, never, give, love, time 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -5.187, relative change = 2.100e-03) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -5.184, relative change = 6.849e-04) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -5.183, relative change = 1.905e-04) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -5.183, relative change = 4.775e-05) 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 10 (approx. per word bound = -5.182, relative change = 1.627e-05) 
## Topic 1: get, wanna, time, right, night 
##  Topic 2: dance, yeah, love, know, generation 
##  Topic 3: got, love, holler, make, oh 
##  Topic 4: la, never, give, time, know 
## ...............................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged

Explore topic model results

song_topics <- tidy(topic_model, matrix = "beta") 
song_topics %>%
    group_by(topic) %>%
    slice_max(beta, n = 10) %>%
    ungroup() %>%
    mutate(topic = paste("Topic", topic)) %>%
    ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(vars(topic), scales = "free_y") +
    scale_y_reordered() +
    labs(x = expression(beta), y = NULL)

song_topics <- tidy(topic_model, matrix = "gamma",
                    document_names = rownames(lyrics_sparse))

song_topics %>%
    mutate(document = fct_reorder(document, gamma),
           topic = factor(topic)) %>%
    ggplot(aes(gamma, topic, fill = topic)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(vars(document), ncol = 4) +
    labs(x = expression(gamma), y = "Topic")

effects <-
    estimateEffect(
        1:4 ~ album_name,
        topic_model,
        tidy_lyrics %>% distinct(song_name, album_name) %>% arrange (song_name)
    )
summary(effects)
## 
## Call:
## estimateEffect(formula = 1:4 ~ album_name, stmobj = topic_model, 
##     metadata = tidy_lyrics %>% distinct(song_name, album_name) %>% 
##         arrange(song_name))
## 
## 
## Topic 1:
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)           0.20909    0.13360   1.565    0.129
## album_nameSpice       0.04241    0.19227   0.221    0.827
## album_nameSpiceworld  0.09401    0.18854   0.499    0.622
## 
## 
## Topic 2:
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)            0.1693     0.1334   1.269    0.215
## album_nameSpice        0.1417     0.1928   0.735    0.469
## album_nameSpiceworld   0.1199     0.1888   0.635    0.531
## 
## 
## Topic 3:
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            0.2195     0.1181   1.858   0.0737 .
## album_nameSpice        0.1000     0.1628   0.614   0.5440  
## album_nameSpiceworld  -0.1997     0.1594  -1.252   0.2208  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Topic 4:
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           0.40273    0.14412   2.794  0.00928 **
## album_nameSpice      -0.28209    0.19975  -1.412  0.16891   
## album_nameSpiceworld -0.01345    0.20230  -0.066  0.94748   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1