library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
lyrics <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-14/lyrics.csv")
## Rows: 1885 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): artist_name, album_name, song_name, section_name, line, section_artist
## dbl (3): track_number, song_id, line_number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lyrics %>% distinct(album_name, song_name)
## # A tibble: 31 × 2
## album_name song_name
## <chr> <chr>
## 1 Spice "Wannabe"
## 2 Spice "Say You\x92ll Be There"
## 3 Spice "2 Become 1"
## 4 Spice "Love Thing"
## 5 Spice "Last Time Lover"
## 6 Spice "Mama"
## 7 Spice "Who Do You Think You Are"
## 8 Spice "Something Kinda Funny"
## 9 Spice "Naked"
## 10 Spice "If U Can\x92t Dance"
## # ℹ 21 more rows
library(tidytext)
tidy_lyrics <-
lyrics %>%
mutate(song_name = str_replace_all(song_name, "\x92", "'")) %>%
unnest_tokens(word, line) %>%
anti_join(get_stopwords())
## Joining with `by = join_by(word)`
tidy_lyrics %>%
count(word, sort = TRUE)
## # A tibble: 979 × 2
## word n
## <chr> <int>
## 1 get 153
## 2 love 137
## 3 know 124
## 4 time 106
## 5 wanna 102
## 6 never 101
## 7 oh 88
## 8 yeah 88
## 9 la 85
## 10 got 82
## # ℹ 969 more rows
tidy_lyrics %>%
count(song_name, word, sort = TRUE)
## # A tibble: 2,206 × 3
## song_name word n
## <chr> <chr> <int>
## 1 Saturday Night Divas get 91
## 2 Spice Up Your Life la 64
## 3 If U Can't Dance dance 60
## 4 Holler holler 48
## 5 Never Give Up on the Good Times never 47
## 6 Move Over generation 41
## 7 Saturday Night Divas deeper 41
## 8 Move Over yeah 39
## 9 Something Kinda Funny got 39
## 10 Never Give Up on the Good Times give 38
## # ℹ 2,196 more rows
lyrics_sparse <-
tidy_lyrics %>%
count(song_name, word) %>%
cast_sparse(song_name, word, n)
dim(lyrics_sparse)
## [1] 31 979
library(stm)
## stm v1.3.6.1 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
set.seed(123)
topic_model <- stm(lyrics_sparse, K = 4, verbose = FALSE)
summary(topic_model)
## A topic model with 4 topics, 31 documents and a 979 word dictionary.
## Topic 1 Top Words:
## Highest Prob: get, wanna, deeper, right, night, come, gotta
## FREX: deeper, saturday, get, comin, back, night, ya
## Lift: jump, party's, body, another, anyway, blameless, breaking
## Score: deeper, saturday, get, night, comin, arms, wanna
## Topic 2 Top Words:
## Highest Prob: dance, yeah, know, generation, next, love, naked
## FREX: next, naked, denying, foolin, nobody, wants, lead
## Lift: foolin, nobody, question, next, admit, bein, check
## Score: next, dance, naked, generation, denying, colour, foolin
## Topic 3 Top Words:
## Highest Prob: got, holler, make, love, wanna, oh, time
## FREX: holler, kinda, swing, funny, yay, use, driving
## Lift: anyone, driving, fantasy, oller, blow, nudge, unwind
## Score: holler, swing, kinda, funny, yay, ashamed, loving
## Topic 4 Top Words:
## Highest Prob: la, never, love, give, time, know, way
## FREX: times, swear, la, bring, promise, viva, tried
## Lift: aggravation, angel, dreamt, heaven, letting, revelation, sent
## Score: la, times, aha, swear, chicas, front, havin
word_topics <- tidy(topic_model, matrix = "beta")
word_topics
## # A tibble: 3,916 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 achieve 1.94e- 3
## 2 2 achieve 8.51e-29
## 3 3 achieve 1.00e-25
## 4 4 achieve 9.51e-19
## 5 1 baby 1.38e- 2
## 6 2 baby 1.44e- 2
## 7 3 baby 1.28e- 3
## 8 4 baby 4.16e- 3
## 9 1 back 2.31e- 2
## 10 2 back 5.44e- 4
## # ℹ 3,906 more rows
word_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(topic = paste("Topic", topic)) %>%
ggplot(aes(beta, reorder_within(term, beta, topic), fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(topic), scales = "free_y") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_reordered() +
labs(x = expression(beta), y = NULL)
song_topics <- tidy(topic_model,
matrix = "gamma",
document_names = rownames(lyrics_sparse)
)
song_topics
## # A tibble: 124 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 2 Become 1 1 0.932
## 2 Denying 1 0.00154
## 3 Do It 1 0.996
## 4 Get Down With Me 1 0.300
## 5 Goodbye 1 0.000971
## 6 Holler 1 0.00155
## 7 If U Can't Dance 1 0.000896
## 8 If You Wanna Have Some Fun 1 0.0171
## 9 Last Time Lover 1 0.140
## 10 Let Love Lead the Way 1 0.00178
## # ℹ 114 more rows
song_topics %>%
mutate(
song_name = fct_reorder(document, gamma),
topic = factor(topic)
) %>%
ggplot(aes(gamma, topic, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(song_name), ncol = 4) +
scale_x_continuous(expand = c(0, 0)) +
labs(x = expression(gamma), y = "Topic")
effects <-
estimateEffect(
1:4 ~ album_name,
topic_model,
tidy_lyrics %>% distinct(song_name, album_name) %>% arrange(song_name)
)
summary(effects)
##
## Call:
## estimateEffect(formula = 1:4 ~ album_name, stmobj = topic_model,
## metadata = tidy_lyrics %>% distinct(song_name, album_name) %>%
## arrange(song_name))
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.14061 0.12301 1.143 0.263
## album_nameSpice 0.09258 0.17701 0.523 0.605
## album_nameSpiceworld 0.15105 0.17327 0.872 0.391
##
##
## Topic 2:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1471 0.1327 1.109 0.277
## album_nameSpice 0.1327 0.1887 0.703 0.488
## album_nameSpiceworld 0.1472 0.1851 0.795 0.433
##
##
## Topic 3:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.29480 0.12114 2.434 0.0216 *
## album_nameSpice 0.07952 0.17174 0.463 0.6470
## album_nameSpiceworld -0.28112 0.16919 -1.662 0.1078
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Topic 4:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.41760 0.13938 2.996 0.00567 **
## album_nameSpice -0.30293 0.19793 -1.531 0.13711
## album_nameSpiceworld -0.01948 0.19352 -0.101 0.92053
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tidy(effects)
## # A tibble: 12 × 6
## topic term estimate std.error statistic p.value
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 (Intercept) 0.139 0.123 1.13 0.267
## 2 1 album_nameSpice 0.0926 0.177 0.525 0.604
## 3 1 album_nameSpiceworld 0.152 0.175 0.867 0.393
## 4 2 (Intercept) 0.148 0.133 1.11 0.275
## 5 2 album_nameSpice 0.132 0.187 0.703 0.488
## 6 2 album_nameSpiceworld 0.149 0.185 0.803 0.429
## 7 3 (Intercept) 0.295 0.120 2.46 0.0204
## 8 3 album_nameSpice 0.0766 0.171 0.449 0.657
## 9 3 album_nameSpiceworld -0.283 0.167 -1.69 0.102
## 10 4 (Intercept) 0.416 0.138 3.00 0.00556
## 11 4 album_nameSpice -0.303 0.192 -1.58 0.126
## 12 4 album_nameSpiceworld -0.0174 0.193 -0.0902 0.929
#1 - The modeling goal was to use unsupervised modeling to discover topics in the lyrics of Spice Girls songs. -The data consists of the artist, album, song, line etc. - The main variables used in the analysis consist of the line, broken into words, song name.
#2 - The original data has been changed by tokenizing the text and
removing stop words. This allows the data to be looked at by each word
without filler words like “the” and “and”.
#3 - topic model - To find optimal K value you can try multiple values
to find the best value for K.
#4 - We were able to determine that there is no statistical evidence of change in the lyrics of spice girls songs across the albums. .