Explore!

library(tidyverse)
library(tidymodels)
library(spotifyr)
# Themes 
library(cowplot)
library(ggthemr)
ggthemr("grass", type = "outer", layout = "scientific", spacing = 2)
## ggthemr::ggthemr_reset()
## theme_jex from Julia Silge ---------------------------------------
theme_plex <- function(base_size = 11,
                       strip_text_size = 12,
                       strip_text_margin = 5,
                       subtitle_size = 13,
                       subtitle_margin = 10,
                       plot_title_size = 16,
                       plot_title_margin = 10,
                       ...) {
    ret <- ggplot2::theme_minimal(base_family = "IBMPlexSans",
                                  base_size = base_size, ...)
    ret$strip.text <- ggplot2::element_text(
        hjust = 0, size = strip_text_size,
        margin = ggplot2::margin(b = strip_text_margin),
        family = "IBMPlexSans-Medium"
    )
    ret$plot.subtitle <- ggplot2::element_text(
        hjust = 0, size = subtitle_size,
        margin = ggplot2::margin(b = subtitle_margin),
        family = "IBMPlexSans"
    )
    ret$plot.title <- ggplot2::element_text(
        hjust = 0, size = plot_title_size,
        margin = ggplot2::margin(b = plot_title_margin),
        family = "IBMPlexSans-Bold"
    )
    ret
}

# Data Fetch---------------------------------------------------------
rankings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/rankings.csv")

color_sets <- c("gray60", "orange", "green", "darkred", "black")
base_plot <- rankings %>%
  mutate(fill_color = sample(color_sets, nrow(rankings), replace = T)) %>%
  ggplot(aes(x = year, y = points)) +
  geom_jitter(aes(color = fill_color, size = log(points)), alpha = 0.7) +
  labs(x = NULL, y = NULL, title = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "None"
        )

ggdraw() +
  draw_image("https://upload.wikimedia.org/wikipedia/en/1/16/ATCQMidnightMarauders.jpg",
             scale = 0.8) +
  draw_plot(base_plot)

glimpse(rankings)

## Observations: 311
## Variables: 12
## $ ID     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ title  <chr> "Juicy", "Fight The Power", "Shook Ones (Part II)", "The Messa…
## $ artist <chr> "The Notorious B.I.G.", "Public Enemy", "Mobb Deep", "Grandmas…
## $ year   <dbl> 1994, 1989, 1995, 1982, 1992, 1993, 1993, 1992, 1994, 1995, 20…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male", "male"…
## $ points <dbl> 140, 100, 94, 90, 84, 62, 50, 48, 46, 42, 38, 36, 36, 34, 32, …
## $ n      <dbl> 18, 11, 13, 14, 14, 10, 7, 6, 7, 6, 5, 5, 4, 6, 5, 5, 4, 5, 5,…
## $ n1     <dbl> 9, 7, 4, 5, 2, 3, 2, 3, 1, 2, 2, 1, 2, 1, 1, 0, 2, 2, 1, 1, 1,…
## $ n2     <dbl> 3, 3, 5, 3, 4, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1, 3, 1, 0, 1, 1, 1,…
## $ n3     <dbl> 3, 1, 1, 1, 2, 1, 2, 0, 1, 1, 3, 3, 0, 2, 2, 1, 0, 1, 1, 1, 0,…
## $ n4     <dbl> 1, 0, 1, 0, 4, 4, 0, 0, 1, 2, 0, 0, 0, 3, 0, 0, 1, 0, 1, 0, 2,…
## $ n5     <dbl> 2, 0, 2, 5, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2, 1, 2, 1,…

Where the ladies at?

rankings %>%
  ggplot(aes(x = year, y = points, label = title)) +
  geom_jitter(aes(color = gender), size = 2, alpha = 0.5) +
  geom_text(check_overlap = T, family = "IBMPlexSans") +
  annotate("rect", xmin = 1985, xmax = 1995, 
                   ymin = 0, ymax = Inf, alpha = .2) +
  annotate("text", x = 1990, y = 120, label = "Golden Age", 
           color = "orange", size = 5) +
  ggtitle("Where the ladies at?") +
  theme_plex()

Get the Spotify identifiers

Sys.setenv(SPOTIFY_CLIENT_ID = '8b8441f23b6940f092af2dc8f134a2e7')
Sys.setenv(SPOTIFY_CLIENT_SECRET = 'f383b4f8279d45b4829d0e310c0eca73')
access_token <- get_spotify_access_token()
playlist_features <- get_playlist_audio_features("tmock1923",
                                                 "7esD007S7kzeSwVtcH9GFe")
playlist_features

## # A tibble: 250 x 61
##    playlist_id playlist_name playlist_img playlist_owner_… playlist_owner_…
##    <chr>       <chr>         <chr>        <chr>            <chr>           
##  1 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  2 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  3 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  4 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  5 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  6 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  7 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  8 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
##  9 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
## 10 7esD007S7k… Top 250 Hiph… https://mos… tmock1923        tmock1923       
## # … with 240 more rows, and 56 more variables: danceability <dbl>,
## #   energy <dbl>, key <int>, loudness <dbl>, mode <int>, speechiness <dbl>,
## #   acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
## #   tempo <dbl>, track.id <chr>, analysis_url <chr>, time_signature <int>,
## #   added_at <chr>, is_local <lgl>, primary_color <lgl>, added_by.href <chr>,
## #   added_by.id <chr>, added_by.type <chr>, added_by.uri <chr>,
## #   added_by.external_urls.spotify <chr>, track.artists <list>,
## #   track.available_markets <list>, track.disc_number <int>,
## #   track.duration_ms <int>, track.episode <lgl>, track.explicit <lgl>,
## #   track.href <chr>, track.is_local <lgl>, track.name <chr>,
## #   track.popularity <int>, track.preview_url <chr>, track.track <lgl>,
## #   track.track_number <int>, track.type <chr>, track.uri <chr>,
## #   track.album.album_type <chr>, track.album.artists <list>,
## #   track.album.available_markets <list>, track.album.href <chr>,
## #   track.album.id <chr>, track.album.images <list>, track.album.name <chr>,
## #   track.album.release_date <chr>, track.album.release_date_precision <chr>,
## #   track.album.total_tracks <int>, track.album.type <chr>,
## #   track.album.uri <chr>, track.album.external_urls.spotify <chr>,
## #   track.external_ids.isrc <chr>, track.external_urls.spotify <chr>,
## #   video_thumbnail.url <lgl>, key_name <chr>, mode_name <chr>, key_mode <chr>

This block of code is interesting.

First:

There are two many versions of one song, so we only choose the one with the most popularity.

We remove the featured artist, since it might fail our searching function. (e.g. Nuthin’ But A ‘G’ Thang; Dr Dre ft. Snoop Doggy Dogg)

Last:

Pay attention to the map_chr function. I’ve waited so long to use the map function, it works perfect for user-identified function to vectorize. Besides, see we add possibly to wrap the second parameter. This move is to make sure when there is an error the program won’t suddendly terminate, rather it quitely fill that specific position with a pre-set value, here we use NA_character.

pull_id <- function(query) {
  search_spotify(query, "track") %>%
    arrange(-popularity) %>%
    filter(row_number() == 1) %>%
    pull(id)
}
ranking_ids <- rankings %>%
  mutate(
    search_query = str_to_lower(paste(title, artist)),
    search_query = str_remove(search_query, "ft.*$")             # Key Point
  ) %>%
  mutate(
    id = map_chr(search_query, possibly(pull_id, NA_character_)) # Key Point
  )

ranking_ids %>%
  select(title, artist, id)

## # A tibble: 311 x 3
##    title                  artist                            id                  
##    <chr>                  <chr>                             <chr>               
##  1 Juicy                  The Notorious B.I.G.              5ByAIlEEnxYdvpnezg7…
##  2 Fight The Power        Public Enemy                      1yo16b3u0lptm6Cs7lx…
##  3 Shook Ones (Part II)   Mobb Deep                         4nASzyRbzL5qZQuOPjQ…
##  4 The Message            Grandmaster Flash & The Furious … 5DuTNKFEjJIySAyJH1y…
##  5 Nuthin’ But A ‘G’ Tha… Dr Dre ft. Snoop Doggy Dogg       4YtoipFgf4k0AfD17Zf…
##  6 C.R.E.A.M.             Wu-Tang Clan                      119c93MHjrDLJTApCVG…
##  7 93 ’Til Infinity       Souls of Mischief                 0PV1TFUMTBrDETzW6KQ…
##  8 Passin’ Me By          The Pharcyde                      4G3dZN9o3o2X4VKwt4C…
##  9 N.Y. State Of Mind     Nas                               5zwz05jkQVT68CjUpPw…
## 10 Dear Mama              2Pac                              6tDxrq4FxEL2q15y37t…
## # … with 301 more rows

There are 6% of songs that we failed to find a Spodify track identifier for.

Identifier –> Audio Features

The function get_track_audio_features() can only take 100 tracks at most at once, so let’s divide up our tracks into smaller chunks and then map() through them.

# Structure
ranking_ids %>%
  mutate(id_group = row_number()%/%80) %>% # Divide into 4 groups
  select(id_group, id) %>%                 # id_gourp and id are all we need
  nest(data = c(id))                       # Nest into 4 tibble

## # A tibble: 4 x 2
##   id_group data             
##      <dbl> <list>           
## 1        0 <tibble [79 × 1]>
## 2        1 <tibble [80 × 1]>
## 3        2 <tibble [80 × 1]>
## 4        3 <tibble [72 × 1]>

# Implementation
ranking_features <- ranking_ids %>%
  mutate(id_group = row_number()%/%80) %>%
  select(id_group, id) %>%
  nest(data = c(id)) %>%
  mutate(audio_features = map(data, ~ get_track_audio_features(.$id))) # Adding Features through map function
ranking_features

## # A tibble: 4 x 3
##   id_group data              audio_features    
##      <dbl> <list>            <list>            
## 1        0 <tibble [79 × 1]> <tibble [79 × 18]>
## 2        1 <tibble [80 × 1]> <tibble [80 × 18]>
## 3        2 <tibble [80 × 1]> <tibble [80 × 18]>
## 4        3 <tibble [72 × 1]> <tibble [72 × 18]>

Now with the features, let’s put the rankings and features together and create a dataframe for modeling.

ranking_df <- ranking_ids %>%
  bind_cols(ranking_features %>%
              select(audio_features) %>%
              unnest(audio_features)) %>%
  select(title, artist, points, year, danceability:tempo) %>% # Filter some features, and as well the information
  na.omit() # Omit the NAs at the end
ranking_df

## # A tibble: 293 x 15
##    title artist points  year danceability energy   key loudness  mode
##    <chr> <chr>   <dbl> <dbl>        <dbl>  <dbl> <int>    <dbl> <int>
##  1 Juicy The N…    140  1994        0.889  0.816     9    -4.67     1
##  2 Figh… Publi…    100  1989        0.797  0.582     2   -13.0      1
##  3 Shoo… Mobb …     94  1995        0.637  0.878     6    -5.51     1
##  4 The … Grand…     90  1982        0.947  0.607    10   -10.6      0
##  5 Nuth… Dr Dr…     84  1992        0.801  0.699    11    -8.18     0
##  6 C.R.… Wu-Ta…     62  1993        0.479  0.549    11   -10.6      0
##  7 93 ’… Souls…     50  1993        0.59   0.672     1   -11.8      1
##  8 Pass… The P…     48  1992        0.759  0.756     4    -8.14     0
##  9 N.Y.… Nas        46  1994        0.665  0.91      6    -4.68     0
## 10 Dear… 2Pac       42  1995        0.773  0.54      6    -7.12     1
## # … with 283 more rows, and 6 more variables: speechiness <dbl>,
## #   acousticness <dbl>, instrumentalness <dbl>, liveness <dbl>, valence <dbl>,
## #   tempo <dbl>

Now we see the correlations between the features.

library(corrr)
ranking_df %>%
  select(year:tempo) %>%
  correlate() %>%         # Return the cor matrix
  rearrange() %>%
  shave() %>%             # Remove the upper dim
  rplot(shape = 15, colours = c("darkorange", "white", "darkcyan")) +
  ggtitle("Louder songs have higher energy\nOlder songs are more danceable") +
  theme_plex()

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus

## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

PCA Modeling

Recipe

We have too many features! Our goal is to merge them and use the components to gather the contributive information to fit the points.

update_role(): generate ids or labels that either predictors or outcomes
step_normalize(): Center & Scale since we plan to use the PCA

# This is just a recipe
ranking_rec <- recipe(points ~., data = ranking_df) %>%
  update_role(title, artist, new_role = "id") %>% 
  step_log(points) %>%
  step_normalize(all_predictors()) %>% # Center & Scale
  step_pca(all_predictors())
# Prep will get it done according to the recipe
ranking_prep <- prep(ranking_rec)
ranking_prep

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          2
##    outcome          1
##  predictor         12
## 
## Training data contained 293 data points and no missing data.
## 
## Operations:
## 
## Log transformation on points [trained]
## Centering and scaling for year, danceability, energy, key, loudness, ... [trained]
## PCA extraction with year, danceability, energy, key, loudness, ... [trained]

See the components

tidy(): tidy any of our recipe steps

We first focus on the first four components.

library(tidytext)

## 
## Attaching package: 'tidytext'

## The following object is masked from 'package:spotifyr':
## 
##     tidy

tidied_pca <- tidy(ranking_prep, 3) # The 3rd step --> PCA

tidied_pca %>%
  filter(component %in% c("PC1", "PC2", "PC3", "PC4")) %>%
  group_by(component) %>%
  top_n(6, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, 
                                abs(value), 
                                component)
         ) %>% # Arrange within component 
  ggplot(aes(x = terms, y = abs(value), 
             fill = value > 0)
         ) +
  geom_bar(stat = "identity") +
  facet_wrap(~component, scales = "free_y") +
  theme_plex() +
  scale_x_reordered() + # Make sure you use this after reorder_within()
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  ) +
  coord_flip()

So \(PC_{1}\) is mostly about age and danceability, \(PC_{2}\) is mostly energy and loudness, \(PC_{3}\) is mostly speechiness, and \(PC_{4}\) is about the musical characteristics (actual key and major vs. minor key).

How songs distributed in the plane of the components?

\(PC_{1}\) VS. \(PC_{2}\)

juice(ranking_prep) %>%
  ggplot(aes(PC1, PC2, label = title)) +
  geom_point(alpha = 0.9, size = 2) +
  geom_text(check_overlap = TRUE, family = "IBMPlexSans") +
  ggtitle("PC1 vs. PC2") +
  theme_plex() +
  theme(plot.title = element_text(hjust = 0.5))

\(PC_{2}\) VS. \(PC_{3}\)

juice(ranking_prep) %>%
  ggplot(aes(PC2, PC3, label = title)) +
  geom_point(alpha = 0.9, size = 2) +
  geom_text(check_overlap = TRUE, family = "IBMPlexSans") +
  ggtitle("PC1 vs. PC2") +
  theme_plex() +
  theme(plot.title = element_text(hjust = 0.5))

Components’ Contribution

sdev <- ranking_prep$steps[[3]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(
  component = unique(tidied_pca$component),
  percent_var = percent_variation ## use cumsum() to find cumulative, if you prefer
) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_variation)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = NULL, y = NULL, 
       title = "Percent variance explained by each component") +
  theme_plex()

Fit

Use juice() to juice our train data for the model.

juice(ranking_prep)

## # A tibble: 293 x 8
##    title        artist             points     PC1    PC2     PC3    PC4      PC5
##    <fct>        <fct>               <dbl>   <dbl>  <dbl>   <dbl>  <dbl>    <dbl>
##  1 Juicy        The Notorious B.I…   4.94 -0.987   0.904 -1.10    1.16   5.46e-1
##  2 Fight The P… Public Enemy         4.61 -0.837  -1.42   0.686   0.184 -1.78e+0
##  3 Shook Ones … Mobb Deep            4.54  0.0153  1.06  -0.929   0.681 -3.27e-1
##  4 The Message  Grandmaster Flash…   4.50 -3.42    0.138 -0.0333 -0.653 -2.63e-4
##  5 Nuthin’ But… Dr Dre ft. Snoop …   4.43 -1.90    0.405 -0.629  -1.18   8.27e-2
##  6 C.R.E.A.M.   Wu-Tang Clan         4.13  0.190  -2.25  -1.94   -0.245  2.80e+0
##  7 93 ’Til Inf… Souls of Mischief    3.91  0.413  -0.892 -0.576   1.93   1.20e+0
##  8 Passin’ Me … The Pharcyde         3.87 -0.990   0.289 -0.607  -0.615 -1.01e+0
##  9 N.Y. State … Nas                  3.83 -0.819   1.93  -1.41   -0.736 -5.23e-1
## 10 Dear Mama    2Pac                 3.74 -0.143  -0.698  1.19    0.349 -4.05e-1
## # … with 283 more rows

pca_fit <- juice(ranking_prep) %>%
  select(-title, -artist) %>%
  lm(points ~ ., data = .)

summary(pca_fit)

## 
## Call:
## lm(formula = points ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55257 -0.58620  0.04886  0.39583  2.89017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.92516    0.04944  38.936   <2e-16 ***
## PC1         -0.07547    0.03487  -2.165   0.0312 *  
## PC2          0.03540    0.03725   0.950   0.3428    
## PC3         -0.07129    0.04207  -1.695   0.0912 .  
## PC4         -0.03100    0.04520  -0.686   0.4934    
## PC5         -0.04195    0.04749  -0.883   0.3778    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8463 on 287 degrees of freedom
## Multiple R-squared:  0.03273,    Adjusted R-squared:  0.01588 
## F-statistic: 1.942 on 5 and 287 DF,  p-value: 0.08738

Reference

Simon Jockers, The best hip-hop songs of all time, visualized Julia Silge, PCA and the #TidyTuesday best hip hop songs ever

PCA for Top 250 Hip-Hop Music

Net Zhang

2020-05-05