library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.5
## âś” forcats 1.0.0 âś” stringr 1.5.2
## âś” ggplot2 4.0.0 âś” tibble 3.3.0
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
tuesdata <- tidytuesdayR::tt_load(2025, week = 34)
## ---- Compiling #TidyTuesday Information for 2025-08-26 ----
## --- There are 2 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 2: "billboard.csv"
## 2 of 2: "topics.csv"
billboard <- tuesdata$billboard
topics <- tuesdata$topics
head(billboard)
## # A tibble: 6 Ă— 105
## song artist date weeks_at_number_one non_consecutive rating_1
## <chr> <chr> <dttm> <dbl> <dbl> <dbl>
## 1 Poor … Ricky… 1958-08-04 00:00:00 2 0 4
## 2 Nel B… Domen… 1958-08-18 00:00:00 5 1 7
## 3 Littl… The E… 1958-08-25 00:00:00 1 0 5
## 4 It's … Tommy… 1958-09-29 00:00:00 6 0 3
## 5 It's … Conwa… 1958-11-10 00:00:00 2 1 7
## 6 Tom D… The K… 1958-11-17 00:00:00 1 0 5
## # ℹ 99 more variables: rating_2 <dbl>, rating_3 <dbl>, overall_rating <dbl>,
## # divisiveness <dbl>, label <chr>, parent_label <chr>, cdr_genre <chr>,
## # cdr_style <chr>, discogs_genre <chr>, discogs_style <chr>,
## # artist_structure <dbl>, featured_artists <chr>,
## # multiple_lead_vocalists <dbl>, group_named_after_non_lead_singer <dbl>,
## # talent_contestant <chr>, posthumous <dbl>, artist_place_of_origin <chr>,
## # front_person_age <dbl>, artist_male <dbl>, artist_white <dbl>, …
head(topics)
## # A tibble: 6 Ă— 1
## lyrical_topics
## <chr>
## 1 Addiction
## 2 Anger
## 3 Appreciation
## 4 Badassery
## 5 Bad Behavior
## 6 Bad Relationships
billboard <- billboard |>
mutate(
primary_genre = str_split_i(cdr_genre, ";", 1)
)
billboard |>
select(cdr_genre, primary_genre) |>
head(30)
## # A tibble: 30 Ă— 2
## cdr_genre primary_genre
## <chr> <chr>
## 1 Pop;Rock Pop
## 2 Pop Pop
## 3 Rock Rock
## 4 Pop Pop
## 5 Pop Pop
## 6 Folk/Country Folk/Country
## 7 Pop Pop
## 8 Pop Pop
## 9 Pop Pop
## 10 Rock Rock
## # ℹ 20 more rows
genre_dance <- billboard |>
group_by(primary_genre) |>
summarise(
avg_danceability = mean(danceability, na.rm = TRUE),
n = n()
) |>
arrange(n)
genre_dance
## # A tibble: 13 Ă— 3
## primary_genre avg_danceability n
## <chr> <dbl> <int>
## 1 Blues 75 1
## 2 March 71 1
## 3 Polka 84 1
## 4 Latin 73.3 3
## 5 Jazz 61.8 5
## 6 Reggae 78.3 10
## 7 Folk/Country 57.4 25
## 8 Hip Hop 78.2 88
## 9 <NA> 66.7 88
## 10 Electronic/Dance 69.2 91
## 11 Funk/Soul 67.3 228
## 12 Rock 59.0 281
## 13 Pop 56.8 355
genre_dance <- genre_dance |>
mutate(
prob = n / sum(n),
rare = if_else(n == min(n), "Lower probability genre", "Higher probability genre")
)
genre_dance
## # A tibble: 13 Ă— 5
## primary_genre avg_danceability n prob rare
## <chr> <dbl> <int> <dbl> <chr>
## 1 Blues 75 1 0.000850 Lower probability genre
## 2 March 71 1 0.000850 Lower probability genre
## 3 Polka 84 1 0.000850 Lower probability genre
## 4 Latin 73.3 3 0.00255 Higher probability genre
## 5 Jazz 61.8 5 0.00425 Higher probability genre
## 6 Reggae 78.3 10 0.00850 Higher probability genre
## 7 Folk/Country 57.4 25 0.0212 Higher probability genre
## 8 Hip Hop 78.2 88 0.0748 Higher probability genre
## 9 <NA> 66.7 88 0.0748 Higher probability genre
## 10 Electronic/Dance 69.2 91 0.0773 Higher probability genre
## 11 Funk/Soul 67.3 228 0.194 Higher probability genre
## 12 Rock 59.0 281 0.239 Higher probability genre
## 13 Pop 56.8 355 0.302 Higher probability genre
Genres with higher average danceability that appeal to a larger audience are more likely to reach #1, which leads to a higher probability of selection among the number one songs.
ggplot(genre_dance, aes(
x = reorder(primary_genre, avg_danceability),
y = avg_danceability
)) +
geom_col(fill = "#4C72B0", width = 0.7) +
coord_flip() +
labs(
title = "Average Danceability of #1 Songs by Primary Genre",
x = "Primary Genre",
y = "Average Danceability"
)
In this instance, it is very surprising that genres like Polka and Reggae dominate the average danceability. Hip Hop in today’s world doesn’t surprise me as much, but due to how little I feel we hear about Polka or Reggae today, it is surprising how high it is for danceability overall. Another shocking find is genres like Rock and Pop rank rather low in regards to average danceability as usually these are the genres that dominate most trends and dancing.
artist_weeks <- billboard |>
group_by(artist) |>
summarise(
total_weeks_at_1 = sum(weeks_at_number_one, na.rm = TRUE),
n_songs = n()
) |>
arrange(n_songs)
artist_weeks
## # A tibble: 763 Ă— 3
## artist total_weeks_at_1 n_songs
## <chr> <dbl> <int>
## 1 24kGoldn ft. iann dior 8 1
## 2 2Pac ft. K-Ci & Jojo 2 1
## 3 50 Cent 9 1
## 4 50 Cent ft. Nate Dogg 4 1
## 5 50 Cent ft. Olivia 9 1
## 6 6ix9ine & Nicki Minaj 1 1
## 7 ? & the Mysterians 1 1
## 8 A Taste of Honey 3 1
## 9 ABBA 1 1
## 10 Aaliyah 1 1
## # ℹ 753 more rows
artist_weeks <- artist_weeks |>
mutate(
prob = n_songs / sum(n_songs),
rare = if_else(n_songs == min(n_songs), "One-time #1 artist", "Repeat #1 artist")
)
artist_weeks
## # A tibble: 763 Ă— 5
## artist total_weeks_at_1 n_songs prob rare
## <chr> <dbl> <int> <dbl> <chr>
## 1 24kGoldn ft. iann dior 8 1 0.000850 One-time #1 artist
## 2 2Pac ft. K-Ci & Jojo 2 1 0.000850 One-time #1 artist
## 3 50 Cent 9 1 0.000850 One-time #1 artist
## 4 50 Cent ft. Nate Dogg 4 1 0.000850 One-time #1 artist
## 5 50 Cent ft. Olivia 9 1 0.000850 One-time #1 artist
## 6 6ix9ine & Nicki Minaj 1 1 0.000850 One-time #1 artist
## 7 ? & the Mysterians 1 1 0.000850 One-time #1 artist
## 8 A Taste of Honey 3 1 0.000850 One-time #1 artist
## 9 ABBA 1 1 0.000850 One-time #1 artist
## 10 Aaliyah 1 1 0.000850 One-time #1 artist
## # ℹ 753 more rows
Repeat number one artists are more likely to produce multiple number one songs on account of their already established fan bases and popularity, increasing the probability that a randomly selected number one song was produced by a repeating artist.
ggplot(artist_weeks, aes(x = n_songs)) +
geom_histogram(
bins = 20,
fill = "#55A868",
color = "white"
) +
labs(
title = "Distribution of Number-One Songs per Artist",
x = "Number of #1 Songs",
y = "Count of Artists"
)
In this dataset, a vast majority of artists only reach number one once, whereas a handful of repeat artists stand at the top. As for why this is significant, I’d say so due to the fact that if I were to pick one random song from this dataset, it would be more likely to produced by a one hit artist, but the repeat number one artists have far more cumulative weeks at number one.
billboard <- billboard |>
mutate(
dance_group = cut(
danceability,
breaks = c(0, 25, 50, 75, 100),
include.lowest = TRUE,
labels = c("Low", "Medium", "High", "Very High")
)
)
dance_weeks <- billboard |>
group_by(dance_group) |>
summarise(
avg_weeks_at_1 = mean(weeks_at_number_one, na.rm = TRUE),
n = n()
) |>
arrange(n)
dance_weeks
## # A tibble: 5 Ă— 3
## dance_group avg_weeks_at_1 n
## <fct> <dbl> <int>
## 1 <NA> 1 2
## 2 Low 2.29 14
## 3 Medium 2.65 226
## 4 Very High 3.33 248
## 5 High 2.91 687
dance_weeks <- dance_weeks |>
mutate(
prob = n / sum(n),
rare = if_else(n == min(n), "Lower probability danceability group", "Higher probability danceability group")
)
dance_weeks
## # A tibble: 5 Ă— 5
## dance_group avg_weeks_at_1 n prob rare
## <fct> <dbl> <int> <dbl> <chr>
## 1 <NA> 1 2 0.00170 Lower probability danceability group
## 2 Low 2.29 14 0.0119 Higher probability danceability group
## 3 Medium 2.65 226 0.192 Higher probability danceability group
## 4 Very High 3.33 248 0.211 Higher probability danceability group
## 5 High 2.91 687 0.584 Higher probability danceability group
Songs with high and very high danceability are more common at #1 compared to low or medium danceability songs mainly because more danceable songs are more likely to reach chart-topping success due to how mainstream they are, as well as other factors, for example, social media trends in today’s age.
ggplot(dance_weeks, aes(x = dance_group, y = avg_weeks_at_1)) +
geom_col(fill = "#C44E52", width = 0.6) +
labs(
title = "Average Weeks at #1 by Danceability Level",
x = "Danceability Level",
y = "Average Weeks at #1"
)
Based on the information above, high and very high danceability songs spend the most time at number one. This is interesting to me though as the range from very high to low isn’t that large of a gap, as very high danceability sits roughly at 3.3 average weeks at number one, and low danceability is roughly around 2.2 average weeks at number one. Potentially in the future I can return to this section and examine it again with different levels of danceability to see if it changes anything. Maybe having a High and Very High level of danceability throws it off, but I’d like to test it out to see if that holds true or not.
billboard <- billboard |>
mutate(
date = as.Date(date),
year = lubridate::year(date),
decade = paste0(floor(year / 10) * 10, "s")
)
billboard |>
select(date, year, decade) |>
distinct() |>
arrange(date) |>
head()
## # A tibble: 6 Ă— 3
## date year decade
## <date> <dbl> <chr>
## 1 1958-08-04 1958 1950s
## 2 1958-08-18 1958 1950s
## 3 1958-08-25 1958 1950s
## 4 1958-09-29 1958 1950s
## 5 1958-11-10 1958 1950s
## 6 1958-11-17 1958 1950s
genre_decade <- billboard |>
count(primary_genre, decade)
genre_decade
## # A tibble: 46 Ă— 3
## primary_genre decade n
## <chr> <chr> <int>
## 1 Blues 1990s 1
## 2 Electronic/Dance 1970s 2
## 3 Electronic/Dance 1980s 31
## 4 Electronic/Dance 1990s 9
## 5 Electronic/Dance 2000s 16
## 6 Electronic/Dance 2010s 33
## 7 Folk/Country 1950s 2
## 8 Folk/Country 1960s 14
## 9 Folk/Country 1970s 8
## 10 Folk/Country 2000s 1
## # ℹ 36 more rows
all_combos <- expand_grid(
primary_genre = unique(billboard$primary_genre),
decade = unique(billboard$decade)
)
missing_combos <- anti_join(
all_combos,
genre_decade,
by = c("primary_genre", "decade")
)
missing_combos
## # A tibble: 58 Ă— 2
## primary_genre decade
## <chr> <chr>
## 1 Pop 2020s
## 2 Rock 2020s
## 3 Folk/Country 1980s
## 4 Folk/Country 1990s
## 5 Folk/Country 2010s
## 6 Folk/Country 2020s
## 7 Jazz 1980s
## 8 Jazz 1990s
## 9 Jazz 2000s
## 10 Jazz 2010s
## # ℹ 48 more rows
genre_decade |>
arrange(desc(n))
## # A tibble: 46 Ă— 3
## primary_genre decade n
## <chr> <chr> <int>
## 1 Funk/Soul 1970s 93
## 2 Rock 1980s 91
## 3 Pop 1960s 84
## 4 <NA> 2020s 82
## 5 Rock 1960s 81
## 6 Pop 1980s 79
## 7 Pop 1970s 76
## 8 Rock 1970s 72
## 9 Funk/Soul 1990s 46
## 10 Pop 1990s 45
## # ℹ 36 more rows
Certain genres regress or ascend decade to decade because music taste and trends evolve over time. As a result of this, the probability of observing the genre_decade combinations reaching number one is very low.
ggplot(genre_decade, aes(x = decade, y = n, fill = primary_genre)) +
geom_col() +
labs(
title = "Number-One Songs by Primary Genre and Decade",
x = "Decade",
y = "Count of Songs"
)
As for this visualization above, one can take away from it that some genres just never reached number one in that specific decade, and these are the genre and decade combinations that end up showing up in the missing_combos analysis. It also makes me think historically and time period wise what genres were even present decade to decade. For example, I took a Survey of Hip Hop course in my undergraduate program, and Hip Hop didn’t really emerge until the 1970s, which holds true as there is a small sliver of Hip Hop within the 1970s decade column. Overall though, just briefly viewing this visualization one can see that genres like Rock, Pop, Funk/Soul, and Hip Hop have dominated decade to decade, and some rise while some fall as time goes on.