Análisis Exploratorio

Análisis Multivariado

Importamos las librerías necesarias:

library(pacman)
p_load(stringi, tidyverse,  readr)
source('../lib/data-access.R')
source('../lib/plot.R')

Veamos las distribuciones de cada variable numérica:

Consultamos los track features del top 10:

# Handle to the "track_features_top_10" collection.
# NOTE(review): get_collection is presumably defined in ../lib/data-access.R.
top_track_features_collection <- get_collection("track_features_top_10")

# Query the collection with an explicit field projection: `_id` is excluded
# and only the chart, album and audio-feature fields listed below are kept.
top_track_features <- top_track_features_collection$find(
  fields = '{
  "_id": false,
  "position": true,
  "week_start": true,
  "week_end": true,
  "reproductions": true,
  "name": true,
  "artist": true,
  "album_id": true,
  "album": true,
  "number": true,
  "disc_number": true,
  "album_release_date": true,
  "danceability": true,
  "energy": true,
  "loudness": true,
  "speechiness": true,
  "acousticness": true,
  "instrumentalness": true,
  "liveness": true,
  "valence": true,
  "explicit": true,
  "tempo": true,
  "time_signature": true,
  "duration_ms": true,
  "key": true,
  "mode": true
}'
)

Convertimos las fechas a tipo Date para poder compararlas:

# Parse the week boundary columns into Date objects so they can be compared.
top_track_features$week_start <- as.Date(
  top_track_features$week_start,
  format = "%Y-%m-%d"
)
# Fix: week_end is parsed from its own column. It was previously derived from
# week_start, which made both columns identical.
# NOTE(review): assumes week_end uses the same "%Y-%m-%d" format as
# week_start -- confirm against the stored data.
top_track_features$week_end <- as.Date(
  top_track_features$week_end,
  format = "%Y-%m-%d"
)
# Print the structure of the data frame to check the resulting column types.
str(top_track_features)
## 'data.frame':    1148 obs. of  25 variables:
##  $ position          : int  10 8 9 3 6 1 9 4 1 10 ...
##  $ week_start        : Date, format: "2018-12-28" "2019-04-26" ...
##  $ week_end          : Date, format: "2018-12-28" "2019-04-26" ...
##  $ reproductions     : int  17560600 20230172 19467987 23468009 22404832 39419339 21577655 32309199 38174455 17215322 ...
##  $ name              : chr  "Calma - Remix" "7 rings" "Soltera - Remix" "All I Want for Christmas Is You" ...
##  $ artist            : chr  "Pedro Capó" "Ariana Grande" "Bad Bunny" "Mariah Carey" ...
##  $ album_id          : chr  "1tFnP9PwIMeMIuj92mfswZ" "2fYhqwDWXjbpjaIJPEfKFw" "2m9Vuc9Q19qhSm6RQmBgsR" "61ulfFSmmxMhc2wCdmdMkN" ...
##  $ album             : chr  "Calma (Remix)" "thank u, next" "Soltera (Remix)" "Merry Christmas" ...
##  $ number            : int  1 10 1 2 1 9 1 1 10 1 ...
##  $ disc_number       : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ album_release_date: chr  "2018-10-05" "2019-02-08" "2019-05-10" "1994-11-01" ...
##  $ danceability      : num  0.826 0.778 0.795 0.336 0.785 0.835 0.863 0.621 0.778 0.571 ...
##  $ energy            : num  0.773 0.317 0.783 0.627 0.721 0.626 0.666 0.601 0.317 0.693 ...
##  $ loudness          : num  -4.22 -10.73 -4.27 -7.46 -5.46 ...
##  $ speechiness       : num  0.0524 0.334 0.0432 0.0384 0.0506 0.125 0.152 0.148 0.334 0.0545 ...
##  $ acousticness      : num  0.323 0.592 0.361 0.164 0.0149 0.0589 0.212 0.0522 0.592 0.00536 ...
##  $ instrumentalness  : num  0 0 0 0 0.00432 0.00006 0.000493 0 0 0 ...
##  $ liveness          : num  0.143 0.0881 0.437 0.0708 0.285 0.396 0.103 0.46 0.0881 0.173 ...
##  $ valence           : num  0.761 0.327 0.799 0.35 0.894 0.35 0.838 0.457 0.327 0.393 ...
##  $ explicit          : logi  FALSE TRUE FALSE FALSE TRUE TRUE ...
##  $ tempo             : num  127 140 92 150 122 ...
##  $ time_signature    : int  4 4 4 4 4 4 4 5 4 4 ...
##  $ duration_ms       : int  238200 178626 266086 241106 176218 217925 178946 163636 178626 232253 ...
##  $ key               : chr  "B" "C#" "F" "G" ...
##  $ mode              : chr  "minor" "minor" "major" "major" ...

Separamos los features numéricos:

# Keep only the columns for which is.numeric() is TRUE (integer and double).
track_features.num <- select_if(top_track_features, is.numeric)

Estandarizamos cada variable (media 0 y desvío estándar 1):

# Apply scale() to every numeric column; with its default arguments it
# centers each column and divides it by its standard deviation.
track_features.num.scaled <- mutate_all(track_features.num, scale)