library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Download the TidyTuesday release for 2025, week 34
tuesdata <- tidytuesdayR::tt_load(2025, week = 34)
## ---- Compiling #TidyTuesday Information for 2025-08-26 ----
## --- There are 2 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 2: "billboard.csv"
## 2 of 2: "topics.csv"
# Pull the two downloaded tables out of the TidyTuesday object
billboard <- tuesdata$billboard
topics <- tuesdata$topics
# Preview the first rows of the Billboard table
head(billboard)
## # A tibble: 6 × 105
## song artist date weeks_at_number_one non_consecutive rating_1
## <chr> <chr> <dttm> <dbl> <dbl> <dbl>
## 1 Poor … Ricky… 1958-08-04 00:00:00 2 0 4
## 2 Nel B… Domen… 1958-08-18 00:00:00 5 1 7
## 3 Littl… The E… 1958-08-25 00:00:00 1 0 5
## 4 It's … Tommy… 1958-09-29 00:00:00 6 0 3
## 5 It's … Conwa… 1958-11-10 00:00:00 2 1 7
## 6 Tom D… The K… 1958-11-17 00:00:00 1 0 5
## # ℹ 99 more variables: rating_2 <dbl>, rating_3 <dbl>, overall_rating <dbl>,
## # divisiveness <dbl>, label <chr>, parent_label <chr>, cdr_genre <chr>,
## # cdr_style <chr>, discogs_genre <chr>, discogs_style <chr>,
## # artist_structure <dbl>, featured_artists <chr>,
## # multiple_lead_vocalists <dbl>, group_named_after_non_lead_singer <dbl>,
## # talent_contestant <chr>, posthumous <dbl>, artist_place_of_origin <chr>,
## # front_person_age <dbl>, artist_male <dbl>, artist_white <dbl>, …
# Preview the first rows of the lyrical-topics table
head(topics)
## # A tibble: 6 × 1
## lyrical_topics
## <chr>
## 1 Addiction
## 2 Anger
## 3 Appreciation
## 4 Badassery
## 5 Bad Behavior
## 6 Bad Relationships
# cdr_genre can hold several ";"-separated genres; keep the first one as
# the song's primary genre.
billboard <- mutate(
  billboard,
  primary_genre = str_split_i(cdr_genre, pattern = ";", i = 1)
)
# Sanity check: list each distinct raw genre string next to the primary
# genre derived from it.
billboard |>
  distinct(cdr_genre, primary_genre)
## # A tibble: 33 × 2
## cdr_genre primary_genre
## <chr> <chr>
## 1 Pop;Rock Pop
## 2 Pop Pop
## 3 Rock Rock
## 4 Folk/Country Folk/Country
## 5 Folk/Country;March Folk/Country
## 6 Pop;Folk/Country Pop
## 7 Jazz Jazz
## 8 Funk/Soul;Rock Funk/Soul
## 9 Polka Polka
## 10 Funk/Soul Funk/Soul
## # ℹ 23 more rows
set.seed(123)
# change this number, and consider how it affects the sub-sample analysis
sample_frac <- 0.25
# number of samples to scrutinize
n_samples <- 3
# Rows drawn per sample, computed once up front; floor() makes the
# whole-number size explicit instead of passing a fractional size on.
n_per_sample <- floor(sample_frac * nrow(billboard))
# Draw each sample with replacement, tag it with its sample number, and
# combine everything once at the end rather than growing a data frame
# inside a loop. seq_len() gives zero iterations when n_samples is 0,
# whereas 1:n_samples would iterate over c(1, 0).
df_samples <- seq_len(n_samples) |>
  map(\(sample_i) {
    billboard |>
      sample_n(size = n_per_sample, replace = TRUE) |>
      mutate(sample_num = sample_i) # add a column indicating sample number
  }) |>
  bind_rows()
# Per-sample centre and spread of danceability, ignoring missing values
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        median = \(x) median(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 4
## sample_num mean_danceability median_danceability sd_danceability
## <int> <dbl> <dbl> <dbl>
## 1 1 62.1 64 16.1
## 2 2 63.5 64 14.1
## 3 3 62.2 62.5 15.3
# Number of songs per primary genre within each sample, most common first
df_samples |>
  count(sample_num, primary_genre, name = "count") |>
  arrange(sample_num, desc(count))
## # A tibble: 33 × 3
## sample_num primary_genre count
## <int> <chr> <int>
## 1 1 Pop 91
## 2 1 Rock 75
## 3 1 Funk/Soul 62
## 4 1 <NA> 23
## 5 1 Electronic/Dance 19
## 6 1 Hip Hop 11
## 7 1 Folk/Country 5
## 8 1 Reggae 5
## 9 1 Jazz 2
## 10 1 Latin 1
## # ℹ 23 more rows
# Per-sample extremes of danceability, ignoring missing values
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 3
## sample_num min_danceability max_danceability
## <int> <dbl> <dbl>
## 1 1 14 97
## 2 2 24 95
## 3 3 23 97
set.seed(123)
# change this number, and consider how it affects the sub-sample analysis
sample_frac <- 0.10
# number of samples to scrutinize
n_samples <- 3
# Rows drawn per sample, computed once up front; floor() makes the
# whole-number size explicit instead of passing a fractional size on.
n_per_sample <- floor(sample_frac * nrow(billboard))
# Draw each sample with replacement, tag it with its sample number, and
# combine everything once at the end rather than growing a data frame
# inside a loop. seq_len() gives zero iterations when n_samples is 0,
# whereas 1:n_samples would iterate over c(1, 0).
df_samples_10 <- seq_len(n_samples) |>
  map(\(sample_i) {
    billboard |>
      sample_n(size = n_per_sample, replace = TRUE) |>
      mutate(sample_num = sample_i) # add a column indicating sample number
  }) |>
  bind_rows()
# Per-sample centre and spread of danceability, ignoring missing values
df_samples_10 |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        median = \(x) median(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 4
## sample_num mean_danceability median_danceability sd_danceability
## <int> <dbl> <dbl> <dbl>
## 1 1 61.9 64 15.2
## 2 2 62.1 64 17.1
## 3 3 63.4 64 15.4
# Number of songs per primary genre within each sample, most common first
df_samples_10 |>
  count(sample_num, primary_genre, name = "count") |>
  arrange(sample_num, desc(count))
## # A tibble: 27 × 3
## sample_num primary_genre count
## <int> <chr> <int>
## 1 1 Pop 41
## 2 1 Rock 25
## 3 1 Funk/Soul 24
## 4 1 <NA> 11
## 5 1 Electronic/Dance 8
## 6 1 Hip Hop 4
## 7 1 Folk/Country 2
## 8 1 Latin 1
## 9 1 Reggae 1
## 10 2 Rock 36
## # ℹ 17 more rows
# Per-sample extremes of danceability, ignoring missing values
df_samples_10 |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 3
## sample_num min_danceability max_danceability
## <int> <dbl> <dbl>
## 1 1 16 97
## 2 2 14 95
## 3 3 27 95
set.seed(123)
# change this number, and consider how it affects the sub-sample analysis
sample_frac <- 0.75
# number of samples to scrutinize
n_samples <- 3
# Rows drawn per sample, computed once up front; floor() makes the
# whole-number size explicit instead of passing a fractional size on.
n_per_sample <- floor(sample_frac * nrow(billboard))
# Draw each sample with replacement, tag it with its sample number, and
# combine everything once at the end rather than growing a data frame
# inside a loop. seq_len() gives zero iterations when n_samples is 0,
# whereas 1:n_samples would iterate over c(1, 0).
df_samples_75 <- seq_len(n_samples) |>
  map(\(sample_i) {
    billboard |>
      sample_n(size = n_per_sample, replace = TRUE) |>
      mutate(sample_num = sample_i) # add a column indicating sample number
  }) |>
  bind_rows()
# Per-sample centre and spread of danceability, ignoring missing values
df_samples_75 |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        median = \(x) median(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 4
## sample_num mean_danceability median_danceability sd_danceability
## <int> <dbl> <dbl> <dbl>
## 1 1 62.6 64 15.2
## 2 2 62.6 64 15.9
## 3 3 62.7 64 15.4
# Number of songs per primary genre within each sample, most common first
df_samples_75 |>
  count(sample_num, primary_genre, name = "count") |>
  arrange(sample_num, desc(count))
## # A tibble: 36 × 3
## sample_num primary_genre count
## <int> <chr> <int>
## 1 1 Pop 268
## 2 1 Rock 211
## 3 1 Funk/Soul 180
## 4 1 <NA> 70
## 5 1 Electronic/Dance 58
## 6 1 Hip Hop 55
## 7 1 Folk/Country 18
## 8 1 Reggae 10
## 9 1 Latin 5
## 10 1 Jazz 4
## # ℹ 26 more rows
# Per-sample extremes of danceability, ignoring missing values
df_samples_75 |>
  group_by(sample_num) |>
  summarise(
    across(
      danceability,
      list(
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 3
## sample_num min_danceability max_danceability
## <int> <dbl> <dbl>
## 1 1 14 97
## 2 2 14 98
## 3 3 14 98
The 10%, 25%, and 75% subsamples differ from one another, but only slightly, judging by the summary statistics. For the 25% subsamples, the mean danceability ranges from roughly 62.1 to 63.5, whereas the 10% subsamples show slightly more variation, with mean danceability ranging from roughly 61.9 to 63.4. Compared with the 10% and 25% subsamples, the 75% subsamples are extremely close to one another in mean danceability, which ranges only from 62.6 to 62.7. I also looked at primary genre and the counts for each: the counts varied far more noticeably in the smaller subsamples, where certain genres appeared or disappeared completely between samples, whereas in the larger subsamples the primary genres were better represented and more consistent.
In the smaller subsamples, some observations seem anomalous because these subsamples provide only limited context. For example, some of the low danceability scores near the minimum and high danceability scores near the maximum stand out more clearly in the 10% subsamples. On the other hand, these low and high extreme scores also come up across different subsamples and remain in the 25% and 75% samples. Similarly, some of the rarer primary genres, like Jazz or Polka, come up occasionally in some subsamples but not in others, making them look unusual in smaller samples while appearing less important in the larger samples.
Even with the sampling variability, a few aspects stayed consistent across the different subsamples. First, the median danceability was at or just around 64 in every sample, no matter the sample size. Similarly, primary genres like Pop, Rock, and Funk/Soul consistently appeared most frequently across all subsamples. The range of danceability scores was also fairly consistent, with minimum scores of roughly 14-27 and maximum scores of roughly 95-98.
As we’ve learned in class, as the size of subsamples increases, in this instance, from 10% to 25% then to 75%, variability across the subsamples decreases. The larger subsamples provide extremely similar means, medians, and standard deviations for danceability, and the primary genre distributions become more consistent overall. As for smaller subsamples, they show a greater variation in the mean, median, and standard deviation, as well as primary genre distribution, showing a higher sensitivity to random sampling.
This investigation shows how conclusions derived from the Billboard Hot 100 dataset can vary depending on sample size and random sampling variability. As seen above, the smaller subsamples were more sensitive to random fluctuations, which led to greater variability in the mean, median, and standard deviation, and to the inclusion of anomalies that weren’t as persistent in the larger samples. Because of this, future conclusions drawn from limited data should be made with caution, especially when attempting to identify outliers or trends. Where possible, further analyses should use larger samples or many repeated samples in order to capture the true characteristics of the population.