DS 7130 Spotify Project

Authors

Graham Owenby and Jared Lasley

Can you predict the next hit?

Libraries and Data

library(tidyverse)

# Load the Spotify 2023 track data and list its column names.
# NOTE(review): absolute, machine-specific path -- this only runs where that
# directory exists; consider a project-relative path.
spotify <- read_csv(
  file = "/Users/graham/Documents/Grad/Courses/DS 7130/Final Project/spotify-2023.csv"
)
names(spotify)
 [1] "track_name"           "artist(s)_name"       "artist_count"        
 [4] "released_year"        "released_month"       "released_day"        
 [7] "in_spotify_playlists" "in_spotify_charts"    "streams"             
[10] "in_apple_playlists"   "in_apple_charts"      "in_deezer_playlists" 
[13] "in_deezer_charts"     "in_shazam_charts"     "bpm"                 
[16] "key"                  "mode"                 "danceability_%"      
[19] "valence_%"            "energy_%"             "acousticness_%"      
[22] "instrumentalness_%"   "liveness_%"           "speechiness_%"       
# Inspect column types; note that `streams` is parsed as character, not numeric.
str(spotify)
spc_tbl_ [953 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ track_name          : chr [1:953] "Seven (feat. Latto) (Explicit Ver.)" "LALA" "vampire" "Cruel Summer" ...
 $ artist(s)_name      : chr [1:953] "Latto, Jung Kook" "Myke Towers" "Olivia Rodrigo" "Taylor Swift" ...
 $ artist_count        : num [1:953] 2 1 1 1 1 2 2 1 1 2 ...
 $ released_year       : num [1:953] 2023 2023 2023 2019 2023 ...
 $ released_month      : num [1:953] 7 3 6 8 5 6 3 7 5 3 ...
 $ released_day        : num [1:953] 14 23 30 23 18 1 16 7 15 17 ...
 $ in_spotify_playlists: num [1:953] 553 1474 1397 7858 3133 ...
 $ in_spotify_charts   : num [1:953] 147 48 113 100 50 91 50 43 83 44 ...
 $ streams             : chr [1:953] "141381703" "133716286" "140003974" "800840817" ...
 $ in_apple_playlists  : num [1:953] 43 48 94 116 84 67 34 25 60 49 ...
 $ in_apple_charts     : num [1:953] 263 126 207 207 133 213 222 89 210 110 ...
 $ in_deezer_playlists : num [1:953] 45 58 91 125 87 88 43 30 48 66 ...
 $ in_deezer_charts    : num [1:953] 10 14 14 12 15 17 13 13 11 13 ...
 $ in_shazam_charts    : num [1:953] 826 382 949 548 425 946 418 194 953 339 ...
 $ bpm                 : num [1:953] 125 92 138 170 144 141 148 100 130 170 ...
 $ key                 : chr [1:953] "B" "C#" "F" "A" ...
 $ mode                : chr [1:953] "Major" "Major" "Major" "Major" ...
 $ danceability_%      : num [1:953] 80 71 51 55 65 92 67 67 85 81 ...
 $ valence_%           : num [1:953] 89 61 32 58 23 66 83 26 22 56 ...
 $ energy_%            : num [1:953] 83 74 53 72 80 58 76 71 62 48 ...
 $ acousticness_%      : num [1:953] 31 7 17 11 14 19 48 37 12 21 ...
 $ instrumentalness_%  : num [1:953] 0 0 0 0 63 0 0 0 0 0 ...
 $ liveness_%          : num [1:953] 8 10 31 11 11 8 8 11 28 8 ...
 $ speechiness_%       : num [1:953] 4 4 6 15 6 24 3 4 9 33 ...
 - attr(*, "spec")=
  .. cols(
  ..   track_name = col_character(),
  ..   `artist(s)_name` = col_character(),
  ..   artist_count = col_double(),
  ..   released_year = col_double(),
  ..   released_month = col_double(),
  ..   released_day = col_double(),
  ..   in_spotify_playlists = col_double(),
  ..   in_spotify_charts = col_double(),
  ..   streams = col_character(),
  ..   in_apple_playlists = col_double(),
  ..   in_apple_charts = col_double(),
  ..   in_deezer_playlists = col_number(),
  ..   in_deezer_charts = col_double(),
  ..   in_shazam_charts = col_number(),
  ..   bpm = col_double(),
  ..   key = col_character(),
  ..   mode = col_character(),
  ..   `danceability_%` = col_double(),
  ..   `valence_%` = col_double(),
  ..   `energy_%` = col_double(),
  ..   `acousticness_%` = col_double(),
  ..   `instrumentalness_%` = col_double(),
  ..   `liveness_%` = col_double(),
  ..   `speechiness_%` = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

Establishing Predictor Dataframes

# Variable Drop
# Drop the identifier, release-date, and `streams` columns by name rather than
# by position, so the selection does not silently change if the column order
# of the input file changes.
spotify_filtered <- spotify |>
  select(!c(
    track_name, `artist(s)_name`, artist_count,
    released_year, released_month, released_day,
    streams
  ))

# Predictor Sets
# Playlist and chart counts across the streaming platforms.
market_predictors <- spotify |>
  select(
    in_spotify_playlists, in_spotify_charts,
    in_apple_playlists, in_apple_charts,
    in_deezer_playlists, in_deezer_charts,
    in_shazam_charts
  )

# Per-track musical attributes (tempo, key, mode, and the percentage features).
production_predictors <- spotify |>
  select(
    bpm, key, mode,
    `danceability_%`, `valence_%`, `energy_%`, `acousticness_%`,
    `instrumentalness_%`, `liveness_%`, `speechiness_%`
  )

Exploratory Data Analysis (EDA)

#' Summarise one column of a data frame as numeric
#'
#' Coerces `var` to numeric once and reports the row count, mean, minimum,
#' maximum, standard deviation, and the number of values that are missing
#' after coercion.
#'
#' @param data A data frame or tibble.
#' @param var Unquoted name of the column to summarise. Non-numeric columns
#'   (e.g. numbers stored as text) are coerced with `as.numeric()`.
#' @return A tibble with columns `n`, `mean`, `min`, `max`, `sd`, and `NAs`
#'   (one row, or one row per group if `data` is grouped).
summary_stats <- function(data, var) {
  data |>
    # Coerce a single time so every statistic, including the NA count, is
    # computed on the same values and the coercion warning is raised once.
    mutate(.value = as.numeric({{ var }})) |>
    summarize(
      n = n(),
      mean = mean(.value, na.rm = TRUE),
      min = min(.value, na.rm = TRUE),
      max = max(.value, na.rm = TRUE),
      sd = sd(.value, na.rm = TRUE),
      # Count NAs after coercion: entries that cannot be parsed as numbers
      # are excluded from the statistics above, so they must be counted here.
      NAs = sum(is.na(.value))
    )
}

# Summarise `streams`; summary_stats() coerces it to numeric before computing.
summary_stats(spotify, streams)
# A tibble: 1 × 6
      n       mean   min        max         sd   NAs
  <int>      <dbl> <dbl>      <dbl>      <dbl> <int>
1   953 514137425.  2762 3703895074 566856949.     0

Scaling and Clustering