library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
lyrics <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-14/lyrics.csv")
## Rows: 1885 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): artist_name, album_name, song_name, section_name, line, section_artist
## dbl (3): track_number, song_id, line_number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lyrics %>% distinct(album_name)
## # A tibble: 3 × 1
## album_name
## <chr>
## 1 Spice
## 2 Spiceworld
## 3 Forever
lyrics %>% distinct(album_name, song_name)
## # A tibble: 31 × 2
## album_name song_name
## <chr> <chr>
## 1 Spice "Wannabe"
## 2 Spice "Say You\x92ll Be There"
## 3 Spice "2 Become 1"
## 4 Spice "Love Thing"
## 5 Spice "Last Time Lover"
## 6 Spice "Mama"
## 7 Spice "Who Do You Think You Are"
## 8 Spice "Something Kinda Funny"
## 9 Spice "Naked"
## 10 Spice "If U Can\x92t Dance"
## # ℹ 21 more rows
library(tidytext)
tidy_lyrics <-
lyrics %>%
mutate(song_name = str_replace_all(song_name, "\x92", "'")) %>%
unnest_tokens(word, line) %>%
anti_join(get_stopwords())
## Joining with `by = join_by(word)`
tidy_lyrics %>%
count(word, sort = TRUE)
## # A tibble: 979 × 2
## word n
## <chr> <int>
## 1 get 153
## 2 love 137
## 3 know 124
## 4 time 106
## 5 wanna 102
## 6 never 101
## 7 oh 88
## 8 yeah 88
## 9 la 85
## 10 got 82
## # ℹ 969 more rows
tidy_lyrics %>%
count(song_name, word, sort = TRUE)
## # A tibble: 2,206 × 3
## song_name word n
## <chr> <chr> <int>
## 1 Saturday Night Divas get 91
## 2 Spice Up Your Life la 64
## 3 If U Can't Dance dance 60
## 4 Holler holler 48
## 5 Never Give Up on the Good Times never 47
## 6 Move Over generation 41
## 7 Saturday Night Divas deeper 41
## 8 Move Over yeah 39
## 9 Something Kinda Funny got 39
## 10 Never Give Up on the Good Times give 38
## # ℹ 2,196 more rows
lyrics_sparse <-
tidy_lyrics %>%
count(song_name, word) %>%
cast_sparse(song_name, word, n)
dim(lyrics_sparse)
## [1] 31 979
library(stm)
## stm v1.3.6.1 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
set.seed(123)
topic_model <- stm(lyrics_sparse, K = 4, verbose = FALSE)
summary(topic_model)
## A topic model with 4 topics, 31 documents and a 979 word dictionary.
## Topic 1 Top Words:
## Highest Prob: get, wanna, deeper, right, night, come, gotta
## FREX: deeper, saturday, get, comin, back, night, ya
## Lift: jump, party's, body, another, anyway, blameless, breaking
## Score: deeper, saturday, get, night, comin, arms, wanna
## Topic 2 Top Words:
## Highest Prob: dance, yeah, know, generation, next, love, naked
## FREX: next, naked, denying, foolin, nobody, wants, lead
## Lift: foolin, nobody, question, next, admit, bein, check
## Score: next, dance, naked, generation, denying, colour, foolin
## Topic 3 Top Words:
## Highest Prob: got, holler, make, love, wanna, oh, time
## FREX: holler, kinda, swing, funny, yay, use, driving
## Lift: anyone, driving, fantasy, oller, blow, nudge, unwind
## Score: holler, swing, kinda, funny, yay, ashamed, loving
## Topic 4 Top Words:
## Highest Prob: la, never, love, give, time, know, way
## FREX: times, swear, la, bring, promise, viva, tried
## Lift: aggravation, angel, dreamt, heaven, letting, revelation, sent
## Score: la, times, aha, swear, chicas, front, havin
word_topics <- tidy(topic_model, matrix = "beta")
word_topics
## # A tibble: 3,916 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 achieve 1.94e- 3
## 2 2 achieve 8.51e-29
## 3 3 achieve 1.00e-25
## 4 4 achieve 9.51e-19
## 5 1 baby 1.38e- 2
## 6 2 baby 1.44e- 2
## 7 3 baby 1.28e- 3
## 8 4 baby 4.16e- 3
## 9 1 back 2.31e- 2
## 10 2 back 5.44e- 4
## # ℹ 3,906 more rows
word_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(topic = paste("Topic", topic)) %>%
ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(topic), scales = "free_y") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_reordered() +
labs(x = expression(beta), y = NULL)
song_topics <- tidy(topic_model,
matrix = "gamma",
document_names = rownames(lyrics_sparse)
)
song_topics
## # A tibble: 124 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 2 Become 1 1 0.932
## 2 Denying 1 0.00154
## 3 Do It 1 0.996
## 4 Get Down With Me 1 0.300
## 5 Goodbye 1 0.000971
## 6 Holler 1 0.00155
## 7 If U Can't Dance 1 0.000896
## 8 If You Wanna Have Some Fun 1 0.0171
## 9 Last Time Lover 1 0.140
## 10 Let Love Lead the Way 1 0.00178
## # ℹ 114 more rows
song_topics %>%
mutate(
song_name = fct_reorder(document, gamma),
topic = factor(topic)
) %>%
ggplot(aes(gamma, topic, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(song_name), ncol = 4) +
scale_x_continuous(expand = c(0, 0)) +
labs(x = expression(gamma), y = "Topic")
effects <-
estimateEffect(
1:4 ~ album_name,
topic_model,
tidy_lyrics %>% distinct(song_name, album_name) %>% arrange(song_name)
)
summary(effects)
##
## Call:
## estimateEffect(formula = 1:4 ~ album_name, stmobj = topic_model,
## metadata = tidy_lyrics %>% distinct(song_name, album_name) %>%
## arrange(song_name))
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.14061 0.12301 1.143 0.263
## album_nameSpice 0.09258 0.17701 0.523 0.605
## album_nameSpiceworld 0.15105 0.17327 0.872 0.391
##
##
## Topic 2:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1471 0.1327 1.109 0.277
## album_nameSpice 0.1327 0.1887 0.703 0.488
## album_nameSpiceworld 0.1472 0.1851 0.795 0.433
##
##
## Topic 3:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.29480 0.12114 2.434 0.0216 *
## album_nameSpice 0.07952 0.17174 0.463 0.6470
## album_nameSpiceworld -0.28112 0.16919 -1.662 0.1078
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Topic 4:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.41760 0.13938 2.996 0.00567 **
## album_nameSpice -0.30293 0.19793 -1.531 0.13711
## album_nameSpiceworld -0.01948 0.19352 -0.101 0.92053
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tidy(effects)
## # A tibble: 12 × 6
## topic term estimate std.error statistic p.value
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 (Intercept) 0.139 0.123 1.13 0.267
## 2 1 album_nameSpice 0.0926 0.177 0.525 0.604
## 3 1 album_nameSpiceworld 0.152 0.175 0.867 0.393
## 4 2 (Intercept) 0.148 0.133 1.11 0.275
## 5 2 album_nameSpice 0.132 0.187 0.703 0.488
## 6 2 album_nameSpiceworld 0.149 0.185 0.803 0.429
## 7 3 (Intercept) 0.295 0.120 2.46 0.0204
## 8 3 album_nameSpice 0.0766 0.171 0.449 0.657
## 9 3 album_nameSpiceworld -0.283 0.167 -1.69 0.102
## 10 4 (Intercept) 0.416 0.138 3.00 0.00556
## 11 4 album_nameSpice -0.303 0.192 -1.58 0.126
## 12 4 album_nameSpiceworld -0.0174 0.193 -0.0902 0.929
What is the type of model used in the analysis? The model used in analysis is a structural topic model. “stm(lyrics_sparse, K = 4, verbose = FALSE)”. This model is is a statistical model. We converted the text into sparse matrix format using a document term matrix. Document was songs and term was rows. We used cast_sparse to do this.
How is the optimal value for k found? The optimal value for k is found by choosing which value seems best. Because the data set is small, we used 4 because there was only 30 documents. You can train topic models at different values to find which have the best characteristics in order to choose the best value.