Explore!
library(tidyverse)
library(tidymodels)
library(spotifyr)
# Themes
library(cowplot)
library(ggthemr)
ggthemr("grass", type = "outer", layout = "scientific", spacing = 2)
## ggthemr::ggthemr_reset()
## theme_jex from Julia Silge ---------------------------------------
theme_plex <- function(base_size = 11,
strip_text_size = 12,
strip_text_margin = 5,
subtitle_size = 13,
subtitle_margin = 10,
plot_title_size = 16,
plot_title_margin = 10,
...) {
ret <- ggplot2::theme_minimal(base_family = "IBMPlexSans",
base_size = base_size, ...)
ret$strip.text <- ggplot2::element_text(
hjust = 0, size = strip_text_size,
margin = ggplot2::margin(b = strip_text_margin),
family = "IBMPlexSans-Medium"
)
ret$plot.subtitle <- ggplot2::element_text(
hjust = 0, size = subtitle_size,
margin = ggplot2::margin(b = subtitle_margin),
family = "IBMPlexSans"
)
ret$plot.title <- ggplot2::element_text(
hjust = 0, size = plot_title_size,
margin = ggplot2::margin(b = plot_title_margin),
family = "IBMPlexSans-Bold"
)
ret
}
# Data Fetch---------------------------------------------------------
rankings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/rankings.csv")color_sets <- c("gray60", "orange", "green", "darkred", "black")
base_plot <- rankings %>%
mutate(fill_color = sample(color_sets, nrow(rankings), replace = T)) %>%
ggplot(aes(x = year, y = points)) +
geom_jitter(aes(color = fill_color, size = log(points)), alpha = 0.7) +
labs(x = NULL, y = NULL, title = NULL) +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
legend.position = "None"
)
ggdraw() +
draw_image("https://upload.wikimedia.org/wikipedia/en/1/16/ATCQMidnightMarauders.jpg",
scale = 0.8) +
draw_plot(base_plot)## Observations: 311
## Variables: 12
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ title <chr> "Juicy", "Fight The Power", "Shook Ones (Part II)", "The Messa…
## $ artist <chr> "The Notorious B.I.G.", "Public Enemy", "Mobb Deep", "Grandmas…
## $ year <dbl> 1994, 1989, 1995, 1982, 1992, 1993, 1993, 1992, 1994, 1995, 20…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male", "male"…
## $ points <dbl> 140, 100, 94, 90, 84, 62, 50, 48, 46, 42, 38, 36, 36, 34, 32, …
## $ n <dbl> 18, 11, 13, 14, 14, 10, 7, 6, 7, 6, 5, 5, 4, 6, 5, 5, 4, 5, 5,…
## $ n1 <dbl> 9, 7, 4, 5, 2, 3, 2, 3, 1, 2, 2, 1, 2, 1, 1, 0, 2, 2, 1, 1, 1,…
## $ n2 <dbl> 3, 3, 5, 3, 4, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1, 3, 1, 0, 1, 1, 1,…
## $ n3 <dbl> 3, 1, 1, 1, 2, 1, 2, 0, 1, 1, 3, 3, 0, 2, 2, 1, 0, 1, 1, 1, 0,…
## $ n4 <dbl> 1, 0, 1, 0, 4, 4, 0, 0, 1, 2, 0, 0, 0, 3, 0, 0, 1, 0, 1, 0, 2,…
## $ n5 <dbl> 2, 0, 2, 5, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2, 1, 2, 1,…
Where the ladies at?
rankings %>%
ggplot(aes(x = year, y = points, label = title)) +
geom_jitter(aes(color = gender), size = 2, alpha = 0.5) +
geom_text(check_overlap = T, family = "IBMPlexSans") +
annotate("rect", xmin = 1985, xmax = 1995,
ymin = 0, ymax = Inf, alpha = .2) +
annotate("text", x = 1990, y = 120, label = "Golden Age",
color = "orange", size = 5) +
ggtitle("Where the ladies at?") +
theme_plex()Get the Spotify identifiers
Sys.setenv(SPOTIFY_CLIENT_ID = '8b8441f23b6940f092af2dc8f134a2e7')
Sys.setenv(SPOTIFY_CLIENT_SECRET = 'f383b4f8279d45b4829d0e310c0eca73')
access_token <- get_spotify_access_token()
playlist_features <- get_playlist_audio_features("tmock1923",
"7esD007S7kzeSwVtcH9GFe")
playlist_features## # A tibble: 250 x 61
## playlist_id playlist_name playlist_img playlist_owner_… playlist_owner_…
## <chr> <chr> <chr> <chr> <chr>
## 1 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 2 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 3 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 4 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 5 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 6 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 7 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 8 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 9 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## 10 7esD007S7k… Top 250 Hiph… https://mos… tmock1923 tmock1923
## # … with 240 more rows, and 56 more variables: danceability <dbl>,
## # energy <dbl>, key <int>, loudness <dbl>, mode <int>, speechiness <dbl>,
## # acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
## # tempo <dbl>, track.id <chr>, analysis_url <chr>, time_signature <int>,
## # added_at <chr>, is_local <lgl>, primary_color <lgl>, added_by.href <chr>,
## # added_by.id <chr>, added_by.type <chr>, added_by.uri <chr>,
## # added_by.external_urls.spotify <chr>, track.artists <list>,
## # track.available_markets <list>, track.disc_number <int>,
## # track.duration_ms <int>, track.episode <lgl>, track.explicit <lgl>,
## # track.href <chr>, track.is_local <lgl>, track.name <chr>,
## # track.popularity <int>, track.preview_url <chr>, track.track <lgl>,
## # track.track_number <int>, track.type <chr>, track.uri <chr>,
## # track.album.album_type <chr>, track.album.artists <list>,
## # track.album.available_markets <list>, track.album.href <chr>,
## # track.album.id <chr>, track.album.images <list>, track.album.name <chr>,
## # track.album.release_date <chr>, track.album.release_date_precision <chr>,
## # track.album.total_tracks <int>, track.album.type <chr>,
## # track.album.uri <chr>, track.album.external_urls.spotify <chr>,
## # track.external_ids.isrc <chr>, track.external_urls.spotify <chr>,
## # video_thumbnail.url <lgl>, key_name <chr>, mode_name <chr>, key_mode <chr>
This block of code is interesting.
- First:
There are two many versions of one song, so we only choose the one with the most popularity.
- Next:
We remove the featured artist, since it might fail our searching function. (e.g. Nuthin’ But A ‘G’ Thang; Dr Dre ft. Snoop Doggy Dogg)
- Last:
Pay attention to the map_chr function. I’ve waited so long to use the map function, it works perfect for user-identified function to vectorize. Besides, see we add possibly to wrap the second parameter. This move is to make sure when there is an error the program won’t suddendly terminate, rather it quitely fill that specific position with a pre-set value, here we use NA_character.
pull_id <- function(query) {
search_spotify(query, "track") %>%
arrange(-popularity) %>%
filter(row_number() == 1) %>%
pull(id)
}
ranking_ids <- rankings %>%
mutate(
search_query = str_to_lower(paste(title, artist)),
search_query = str_remove(search_query, "ft.*$") # Key Point
) %>%
mutate(
id = map_chr(search_query, possibly(pull_id, NA_character_)) # Key Point
)
ranking_ids %>%
select(title, artist, id)## # A tibble: 311 x 3
## title artist id
## <chr> <chr> <chr>
## 1 Juicy The Notorious B.I.G. 5ByAIlEEnxYdvpnezg7…
## 2 Fight The Power Public Enemy 1yo16b3u0lptm6Cs7lx…
## 3 Shook Ones (Part II) Mobb Deep 4nASzyRbzL5qZQuOPjQ…
## 4 The Message Grandmaster Flash & The Furious … 5DuTNKFEjJIySAyJH1y…
## 5 Nuthin’ But A ‘G’ Tha… Dr Dre ft. Snoop Doggy Dogg 4YtoipFgf4k0AfD17Zf…
## 6 C.R.E.A.M. Wu-Tang Clan 119c93MHjrDLJTApCVG…
## 7 93 ’Til Infinity Souls of Mischief 0PV1TFUMTBrDETzW6KQ…
## 8 Passin’ Me By The Pharcyde 4G3dZN9o3o2X4VKwt4C…
## 9 N.Y. State Of Mind Nas 5zwz05jkQVT68CjUpPw…
## 10 Dear Mama 2Pac 6tDxrq4FxEL2q15y37t…
## # … with 301 more rows
There are 6% of songs that we failed to find a Spodify track identifier for.
Identifier –> Audio Features
The function get_track_audio_features() can only take 100 tracks at most at once, so let’s divide up our tracks into smaller chunks and then map() through them.
# Structure
ranking_ids %>%
mutate(id_group = row_number()%/%80) %>% # Divide into 4 groups
select(id_group, id) %>% # id_gourp and id are all we need
nest(data = c(id)) # Nest into 4 tibble## # A tibble: 4 x 2
## id_group data
## <dbl> <list>
## 1 0 <tibble [79 × 1]>
## 2 1 <tibble [80 × 1]>
## 3 2 <tibble [80 × 1]>
## 4 3 <tibble [72 × 1]>
# Implementation
ranking_features <- ranking_ids %>%
mutate(id_group = row_number()%/%80) %>%
select(id_group, id) %>%
nest(data = c(id)) %>%
mutate(audio_features = map(data, ~ get_track_audio_features(.$id))) # Adding Features through map function
ranking_features## # A tibble: 4 x 3
## id_group data audio_features
## <dbl> <list> <list>
## 1 0 <tibble [79 × 1]> <tibble [79 × 18]>
## 2 1 <tibble [80 × 1]> <tibble [80 × 18]>
## 3 2 <tibble [80 × 1]> <tibble [80 × 18]>
## 4 3 <tibble [72 × 1]> <tibble [72 × 18]>
Now with the features, let’s put the rankings and features together and create a dataframe for modeling.
ranking_df <- ranking_ids %>%
bind_cols(ranking_features %>%
select(audio_features) %>%
unnest(audio_features)) %>%
select(title, artist, points, year, danceability:tempo) %>% # Filter some features, and as well the information
na.omit() # Omit the NAs at the end
ranking_df## # A tibble: 293 x 15
## title artist points year danceability energy key loudness mode
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <int>
## 1 Juicy The N… 140 1994 0.889 0.816 9 -4.67 1
## 2 Figh… Publi… 100 1989 0.797 0.582 2 -13.0 1
## 3 Shoo… Mobb … 94 1995 0.637 0.878 6 -5.51 1
## 4 The … Grand… 90 1982 0.947 0.607 10 -10.6 0
## 5 Nuth… Dr Dr… 84 1992 0.801 0.699 11 -8.18 0
## 6 C.R.… Wu-Ta… 62 1993 0.479 0.549 11 -10.6 0
## 7 93 ’… Souls… 50 1993 0.59 0.672 1 -11.8 1
## 8 Pass… The P… 48 1992 0.759 0.756 4 -8.14 0
## 9 N.Y.… Nas 46 1994 0.665 0.91 6 -4.68 0
## 10 Dear… 2Pac 42 1995 0.773 0.54 6 -7.12 1
## # … with 283 more rows, and 6 more variables: speechiness <dbl>,
## # acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
## # tempo <dbl>
Now we see the correlations between the features.
library(corrr)
ranking_df %>%
select(year:tempo) %>%
correlate() %>% # Return the cor matrix
rearrange() %>%
shave() %>% # Remove the upper dim
rplot(shape = 15, colours = c("darkorange", "white", "darkcyan")) +
ggtitle("Louder songs have higher energy\nOlder songs are more danceable") +
theme_plex()##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.
PCA Modeling
Recipe
We have too many features! Our goal is to merge them and use the components to gather the contributive information to fit the points.
update_role(): generate ids or labels that either predictors or outcomesstep_normalize(): Center & Scale since we plan to use the PCA
# This is just a recipe
ranking_rec <- recipe(points ~., data = ranking_df) %>%
update_role(title, artist, new_role = "id") %>%
step_log(points) %>%
step_normalize(all_predictors()) %>% # Center & Scale
step_pca(all_predictors())
# Prep will get it done according to the recipe
ranking_prep <- prep(ranking_rec)
ranking_prep## Data Recipe
##
## Inputs:
##
## role #variables
## id 2
## outcome 1
## predictor 12
##
## Training data contained 293 data points and no missing data.
##
## Operations:
##
## Log transformation on points [trained]
## Centering and scaling for year, danceability, energy, key, loudness, ... [trained]
## PCA extraction with year, danceability, energy, key, loudness, ... [trained]
See the components
tidy(): tidy any of our recipe steps
We first focus on the first four components.
##
## Attaching package: 'tidytext'
## The following object is masked from 'package:spotifyr':
##
## tidy
tidied_pca <- tidy(ranking_prep, 3) # The 3rd step --> PCA
tidied_pca %>%
filter(component %in% c("PC1", "PC2", "PC3", "PC4")) %>%
group_by(component) %>%
top_n(6, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms,
abs(value),
component)
) %>% # Arrange within component
ggplot(aes(x = terms, y = abs(value),
fill = value > 0)
) +
geom_bar(stat = "identity") +
facet_wrap(~component, scales = "free_y") +
theme_plex() +
scale_x_reordered() + # Make sure you use this after reorder_within()
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
) +
coord_flip()So \(PC_{1}\) is mostly about age and danceability, \(PC_{2}\) is mostly energy and loudness, \(PC_{3}\) is mostly speechiness, and \(PC_{4}\) is about the musical characteristics (actual key and major vs. minor key).
How songs distributed in the plane of the components?
- \(PC_{1}\) VS. \(PC_{2}\)
juice(ranking_prep) %>%
ggplot(aes(PC1, PC2, label = title)) +
geom_point(alpha = 0.9, size = 2) +
geom_text(check_overlap = TRUE, family = "IBMPlexSans") +
ggtitle("PC1 vs. PC2") +
theme_plex() +
theme(plot.title = element_text(hjust = 0.5))- \(PC_{2}\) VS. \(PC_{3}\)
juice(ranking_prep) %>%
ggplot(aes(PC2, PC3, label = title)) +
geom_point(alpha = 0.9, size = 2) +
geom_text(check_overlap = TRUE, family = "IBMPlexSans") +
ggtitle("PC1 vs. PC2") +
theme_plex() +
theme(plot.title = element_text(hjust = 0.5))Components’ Contribution
sdev <- ranking_prep$steps[[3]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(
component = unique(tidied_pca$component),
percent_var = percent_variation ## use cumsum() to find cumulative, if you prefer
) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_variation)) +
geom_col() +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = NULL, y = NULL,
title = "Percent variance explained by each component") +
theme_plex()Fit
Use juice() to juice our train data for the model.
## # A tibble: 293 x 8
## title artist points PC1 PC2 PC3 PC4 PC5
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Juicy The Notorious B.I… 4.94 -0.987 0.904 -1.10 1.16 5.46e-1
## 2 Fight The P… Public Enemy 4.61 -0.837 -1.42 0.686 0.184 -1.78e+0
## 3 Shook Ones … Mobb Deep 4.54 0.0153 1.06 -0.929 0.681 -3.27e-1
## 4 The Message Grandmaster Flash… 4.50 -3.42 0.138 -0.0333 -0.653 -2.63e-4
## 5 Nuthin’ But… Dr Dre ft. Snoop … 4.43 -1.90 0.405 -0.629 -1.18 8.27e-2
## 6 C.R.E.A.M. Wu-Tang Clan 4.13 0.190 -2.25 -1.94 -0.245 2.80e+0
## 7 93 ’Til Inf… Souls of Mischief 3.91 0.413 -0.892 -0.576 1.93 1.20e+0
## 8 Passin’ Me … The Pharcyde 3.87 -0.990 0.289 -0.607 -0.615 -1.01e+0
## 9 N.Y. State … Nas 3.83 -0.819 1.93 -1.41 -0.736 -5.23e-1
## 10 Dear Mama 2Pac 3.74 -0.143 -0.698 1.19 0.349 -4.05e-1
## # … with 283 more rows
pca_fit <- juice(ranking_prep) %>%
select(-title, -artist) %>%
lm(points ~ ., data = .)
summary(pca_fit)##
## Call:
## lm(formula = points ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55257 -0.58620 0.04886 0.39583 2.89017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.92516 0.04944 38.936 <2e-16 ***
## PC1 -0.07547 0.03487 -2.165 0.0312 *
## PC2 0.03540 0.03725 0.950 0.3428
## PC3 -0.07129 0.04207 -1.695 0.0912 .
## PC4 -0.03100 0.04520 -0.686 0.4934
## PC5 -0.04195 0.04749 -0.883 0.3778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8463 on 287 degrees of freedom
## Multiple R-squared: 0.03273, Adjusted R-squared: 0.01588
## F-statistic: 1.942 on 5 and 287 DF, p-value: 0.08738
Reference
Simon Jockers, The best hip-hop songs of all time, visualized Julia Silge, PCA and the #TidyTuesday best hip hop songs ever