#remotes::install_github("johncassil/stringr.plus")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Data

Primeiro você precisa solicitar as suas informações neste site download-your-data , para mais informações sobre as bases disponibilizadas e as respectivas informações, understanding-my-data. E como não são dadas muitas descrições, achei este link que me ajudou a entender melhor as variáveis e seus respectivos níveis (particularmente o reason_start e reason_end: hack_dados_spotifhack_dados_spotify .

historical data

Fiz o download dos dados, salvei em uma pasta, e aí :

#list all JSON files from directory
files_names <- list.files("my_spotify_historical_data/", pattern = "*.json", full.names = T)
  #tentei usar o pacote `fs`, não conhecia, algo não deu certo, e segui sem ele:
  #files_names <- fs::dir_ls(path = "my_spotify_historical_data/", glob = "*.json")


#read them all  
data_list <- files_names %>% map(~jsonlite::fromJSON(.))

#concatenate by row
spotify_full_raw <- data_list %>% bind_rows() 

  #pareceu uma alternativa interessante, mas fica para a Nath do futuro fuçar
  # tibble(file_names = files_names) %>% 
  #   mutate(data = map(files_names, read_csv)) %>% 
  #   unnest()

spotify_full_raw %>% glimpse()
## Rows: 177,255
## Columns: 21
## $ ts                                <chr> "2014-06-09T03:49:48Z", "2014-06-09T…
## $ username                          <chr> "12143382095", "12143382095", "12143…
## $ platform                          <chr> "iOS 7.1.1 (iPad2,5)", "iOS 7.1.1 (i…
## $ ms_played                         <int> 211086, 194079, 231546, 201506, 1885…
## $ conn_country                      <chr> "BR", "BR", "BR", "BR", "BR", "BR", …
## $ ip_addr_decrypted                 <chr> "177.148.215.115", "177.148.215.115"…
## $ user_agent_decrypted              <chr> "unknown", "unknown", "unknown", "un…
## $ master_metadata_track_name        <chr> "Sober", "Money On My Mind", "Sweet …
## $ master_metadata_album_artist_name <chr> "Elli Ingram", "Sam Smith", "The Tem…
## $ master_metadata_album_album_name  <chr> "Sober EP", "Money On My Mind", "Con…
## $ spotify_track_uri                 <chr> "spotify:track:4ymQoZNLQpis51EmcgAoN…
## $ episode_name                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ spotify_episode_uri               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start                      <chr> "", "trackdone", "trackdone", "track…
## $ reason_end                        <chr> "trackdone", "trackdone", "trackdone…
## $ shuffle                           <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ skipped                           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline                           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline_timestamp                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ incognito_mode                    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…

Infos Description

Dataprep

spotify_full <- spotify_full_raw %>% #glimpse()
  select(-username, -platform, -ip_addr_decrypted, 
         -user_agent_decrypted, -spotify_track_uri,  -spotify_episode_uri) %>% 
  mutate(ts = lubridate::as_datetime(ts, format="%Y-%m-%dT%H:%M:%SZ")) %>% 
  mutate(ts_date = lubridate::date(ts)) %>% 
  mutate(ts_year = lubridate::year(ts)) %>% 
  mutate(min_played = round(ms_played*1.7*10^(-5))) %>% 
  relocate(min_played, .after = ms_played) %>% 
  rename(track_name = master_metadata_track_name) %>% 
  rename(artist_name = master_metadata_album_artist_name) %>% 
  rename(album_name = master_metadata_album_album_name) %>% 
  tibble() %>% 
  glimpse()
## Rows: 177,255
## Columns: 18
## $ ts                <dttm> 2014-06-09 03:49:48, 2014-06-09 03:53:02, 2014-06-0…
## $ ms_played         <int> 211086, 194079, 231546, 201506, 188586, 257386, 2074…
## $ min_played        <dbl> 4, 3, 4, 3, 3, 4, 4, 4, 3, 3, 4, 4, 4, 0, 1, 1, 0, 0…
## $ conn_country      <chr> "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR"…
## $ track_name        <chr> "Sober", "Money On My Mind", "Sweet Disposition", "H…
## $ artist_name       <chr> "Elli Ingram", "Sam Smith", "The Temper Trap", "Hot …
## $ album_name        <chr> "Sober EP", "Money On My Mind", "Conditions", "Whate…
## $ episode_name      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start      <chr> "", "trackdone", "trackdone", "trackdone", "trackdon…
## $ reason_end        <chr> "trackdone", "trackdone", "trackdone", "trackdone", …
## $ shuffle           <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
## $ skipped           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline_timestamp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ incognito_mode    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ ts_date           <date> 2014-06-09, 2014-06-09, 2014-06-09, 2014-06-09, 201…
## $ ts_year           <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…

Descriptions

ts_year

spotify_full %>%janitor::tabyl(ts_year)
##  ts_year     n     percent
##     2014   210 0.001184734
##     2016 16799 0.094773067
##     2017 39737 0.224179854
##     2018 47648 0.268810471
##     2019 32678 0.184355871
##     2020 15130 0.085357254
##     2021  9767 0.055101408
##     2022  9921 0.055970212
##     2023  5365 0.030267129

parece que tem um “pulo” do ano de 2015, mas não sei dizer se isso corresponde com a realidade. Não lembro!

spotify_full %>%
  filter(between(ts_year, 2014, 2016)) %>%  
  janitor::tabyl(ts_date) %>% 
  head(n=15)
##     ts_date  n      percent
##  2014-06-09 28 0.0016461873
##  2014-06-14 27 0.0015873949
##  2014-06-15 14 0.0008230937
##  2014-06-16 11 0.0006467164
##  2014-06-19 15 0.0008818861
##  2014-06-20 27 0.0015873949
##  2014-06-22 14 0.0008230937
##  2014-06-23  5 0.0002939620
##  2014-07-02  9 0.0005291316
##  2014-07-07 10 0.0005879240
##  2014-07-08  4 0.0002351696
##  2014-08-09  8 0.0004703392
##  2014-09-12  3 0.0001763772
##  2014-10-25 35 0.0020577341
##  2016-04-19  1 0.0000587924

conn_country

spotify_full %>% janitor::tabyl(conn_country) %>% arrange(-n)
##  conn_country      n      percent
##            BR 169911 0.9585681645
##            PT   4475 0.0252461144
##            ZZ   1557 0.0087839553
##            US    553 0.0031197992
##            DE    299 0.0016868354
##            IE    218 0.0012298666
##            AT    207 0.0011678091
##            NL     35 0.0001974556

tem inconsistências: GPS não é 100% confiável

  • só fui uma vez em US, 2018, tem uma linha de 2019!
spotify_full %>% filter(conn_country == "US") %>% 
  janitor::tabyl(ts_year) %>% arrange(ts_year)
##  ts_year   n    percent
##     2018 540 0.97649186
##     2019  13 0.02350814
  • ZZ é o código para país não identificado, e aparecem alguns assim ao longo dos anos
spotify_full %>% 
  filter(conn_country == "ZZ") %>% 
  janitor::tabyl(ts_year) %>% 
  arrange(ts_year)
##  ts_year   n    percent
##     2017 710 0.45600514
##     2018 717 0.46050096
##     2019 117 0.07514451
##     2020  13 0.00834939
  • nunca fui na Irlanda (IE)!
spotify_full %>% 
  filter(conn_country == "IE") %>% 
  janitor::tabyl(ts_date) %>% 
  arrange(ts_date)
##     ts_date   n     percent
##  2020-02-10 100 0.458715596
##  2020-04-05   3 0.013761468
##  2020-08-04   1 0.004587156
##  2020-08-11   3 0.013761468
##  2020-08-17   8 0.036697248
##  2022-10-19   5 0.022935780
##  2022-10-20  70 0.321100917
##  2022-10-22  28 0.128440367
  • mas note que 95% dos dados são “BR”! Então tá tudo certo ;)

episode

spotify_full %>% 
  group_by(episode_show_name) %>%
  count() %>% 
  arrange(-n)
## # A tibble: 34 × 2
## # Groups:   episode_show_name [34]
##    episode_show_name                                                       n
##    <chr>                                                               <int>
##  1 <NA>                                                               177065
##  2 GunCast | Criatividade e Inovação                                      56
##  3 Mario Sergio Cortella - No Meio do Caminho - Mario Sergio Cortella     37
##  4 Mamilos                                                                15
##  5 Data Hackers                                                           12
##  6 Elas Programam                                                          6
##  7 Pizza de Dados                                                          6
##  8 Spotify                                                                 6
##  9 Inédita Pamonha                                                         5
## 10 AMOR E SEXO, contos eróticos narrados                                   4
## # ℹ 24 more rows

claramente eu não sou uma pessoa de podcast…

spotify_full %>% 
  mutate(episode_NA = ifelse(is.na(episode_show_name), "nao-podcast", "podcast")) %>% 
  #group_by(episode_NA) %>%
  janitor::tabyl(episode_NA) 
##   episode_NA      n     percent
##  nao-podcast 177065 0.998928098
##      podcast    190 0.001071902

reason_start

Considerando as descrições abaixo, o “clickrow” parece ser interessante, pois indica o interesse na faixa em específico, o “playbtn” também, mas aqui há uma mudança de lista, de estilo. Mas entendo que as duas têm uma natureza similar. O “backbtn” parece bem interessante também, pois indica o comportamento de “repeat”.

  • “trackdone” - The previous track played to its end and this was the next track to play
  • “fwdbtn” - The user pressed Next/Forward and this was the next track to play
  • “clickrow” - The user pressed a specific track in a list of tracks.
  • “backbtn” - The user pressed Back and this was the previous track to play
  • “playbtn” - The user pressed Play on a playlist or other list of tracks
  • “unknown” - The reason why the user (or the app) started playing this track is unknown
  • “trackerror” - The playback of the previous track ran into some form of error, and this was the next track to play
  • “remote” - The track was startedon another device and then transferred to this device.
spotify_full %>% 
  janitor::tabyl(reason_start) %>% 
  janitor::adorn_pct_formatting() %>% 
  arrange(-n)
##  reason_start      n percent
##     trackdone 111533   62.9%
##        fwdbtn  45318   25.6%
##      clickrow   8781    5.0%
##       backbtn   4667    2.6%
##       appload   2471    1.4%
##       playbtn   2432    1.4%
##       unknown   1171    0.7%
##    trackerror    613    0.3%
##        remote    227    0.1%
##                   42    0.0%

reason_end

Pelas descrições que seguem, o “trackdone” talvez seja um filtro importante para aplicar em toda a base a depender do objetivo. Já as músicas “fwdbtn” indicam uma falta de interesse direta, e o “endplay” um desinteresse indireto talvez? Na contramão, “backbtn”, assim como o `reason_start`, é sobre o comportamento de “repeat”.

  • “trackdone” - The track played to its end
  • “fwdbtn” - The user pressed Next/Forward
  • “endplay” - The user started playing something else (by clicking a track or Play/Shuffle buttonor similar)
  • “backbtn” - The user pressed Back
  • “logout” - The app was shut down or theuser logged out. “remote” - The track playback was moved to another device.
  • “unexpected-exit” - The app was shut down eitherby the user, or by the operating system, or it crashed.
  • “unexpected-exit-while-paused” - Theplayback was paused, and at some point after that, the app was shut down either by the user,or by the operating system, or it crashed.
  • “trackerror” - The playback of theprevious track ran into some form of error, and this was the next track to play
  • “remote” - The track playback was moved to another device
  • “clickrow” - The userpressed a specific track in a list of tracks.
spotify_full %>% 
  janitor::tabyl(reason_end) %>% 
  janitor::adorn_pct_formatting() %>% 
  arrange(-n)
##                    reason_end      n percent
##                     trackdone 112251   63.3%
##                        fwdbtn  45618   25.7%
##                       endplay   7840    4.4%
##                       backbtn   4704    2.7%
##                        logout   3881    2.2%
##                       unknown   1351    0.8%
##  unexpected-exit-while-paused   1181    0.7%
##               unexpected-exit    159    0.1%
##                    trackerror    135    0.1%
##                        remote    108    0.1%
##                                   26    0.0%
##                      clickrow      1    0.0%

skipped

info com problema de missing local

Estava super animada para avaliar essa info, mas parece que ela não foi devidamente registrada ao longo de todo o histórico :( Enviei um e-mail para o spotify para entender melhor sobre!

spotify_full %>% janitor::tabyl(skipped) %>% arrange(-n)
##  skipped      n    percent valid_percent
##       NA 168941 0.95309582            NA
##    FALSE   5396 0.03044202     0.6490257
##     TRUE   2918 0.01646216     0.3509743
spotify_full %>% janitor::tabyl(skipped, ts_year) 
##  skipped 2014  2016  2017  2018  2019  2020 2021 2022 2023
##    FALSE  133     0     3     0     0     0    0 1624 3636
##     TRUE   77     0     0     0     0     0    0 1112 1729
##       NA    0 16799 39734 47648 32678 15130 9767 7185    0
spotify_full %>% 
  janitor::tabyl(reason_end, skipped) %>% 
  janitor::adorn_totals("col") %>% 
  arrange(-Total)
##                    reason_end FALSE TRUE    NA_  Total
##                     trackdone  5014    0 107237 112251
##                        fwdbtn     0 2154  43464  45618
##                       endplay     0  517   7323   7840
##                       backbtn     0  221   4483   4704
##                        logout   337    0   3544   3881
##                       unknown     0    0   1351   1351
##  unexpected-exit-while-paused    18    0   1163   1181
##               unexpected-exit    14    0    145    159
##                    trackerror     0    0    135    135
##                        remote    13    0     95    108
##                                   0   26      0     26
##                      clickrow     0    0      1      1

incognito_mode

Não sei bem o que é, mas não tem dados suficientes para que vala a pena se preocupar…

spotify_full %>% janitor::tabyl(incognito_mode) %>% arrange(-n)
##  incognito_mode      n      percent
##           FALSE 177251 9.999774e-01
##            TRUE      4 2.256636e-05

Artist vs. track vs. min_played

spotify_full %>%  
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  ungroup() %>% 
  slice_max(min_played, n = 100) %>% 
  DT::datatable()
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

album_name

spotify_full %>% 
  group_by(album_name, artist_name) %>% 
  count() %>% 
  arrange(-n) %>% 
  ungroup() %>% 
  slice_head(n = 100) %>% 
  DT::datatable()

artist_name

spotify_full %>% 
  group_by(artist_name) %>% 
  count() %>% 
  arrange(-n)
## # A tibble: 6,326 × 2
## # Groups:   artist_name [6,326]
##    artist_name          n
##    <chr>            <int>
##  1 P!nk              3265
##  2 James Morrison    2765
##  3 Queen             2617
##  4 Marília Mendonça  2551
##  5 John Mayer        1678
##  6 Cássia Eller      1586
##  7 Maroon 5          1500
##  8 Boyce Avenue      1491
##  9 TIAGO IORC        1377
## 10 Marisa Monte      1285
## # ℹ 6,316 more rows
spotify_full %>% 
  group_by(artist_name) %>% 
  count() %>% 
  ungroup() %>% 
  skimr::skim() 
Data summary
Name Piped data
Number of rows 6326
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
artist_name 1 1 2 91 0 6325 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
n 0 1 28.02 122.42 1 1 3 10 3265 ▇▁▁▁▁

track_name

spotify_full %>% 
  group_by(track_name, artist_name) %>% 
  count() %>% 
  arrange(-n)
## # A tibble: 22,268 × 3
## # Groups:   track_name, artist_name [22,268]
##    track_name                               artist_name        n
##    <chr>                                    <chr>          <int>
##  1 Please Don't Stop The Rain               James Morrison   431
##  2 You Give Me Something - (Live fromTokyo) James Morrison   356
##  3 Drops of Jupiter (Tell Me)               Train            343
##  4 Mr. Brightside                           The Killers      338
##  5 Viva                                     Zimbra           319
##  6 Classic                                  MKTO             314
##  7 Resposta                                 Skank            310
##  8 Never Gonna Let You Down                 Colbie Caillat   308
##  9 Sorry Not Sorry - Acoustic               Demi Lovato      307
## 10 Bubbly                                   Colbie Caillat   305
## # ℹ 22,258 more rows

Qtd de minutos tocados por artistName

spotify_full %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## # A tibble: 6,326 × 3
##    artist_name      min_played       p
##    <chr>                 <dbl>   <dbl>
##  1 P!nk                  10808 0.0229 
##  2 James Morrison         9990 0.0212 
##  3 Queen                  9216 0.0196 
##  4 Marília Mendonça       6451 0.0137 
##  5 Maroon 5               4947 0.0105 
##  6 John Mayer             4913 0.0104 
##  7 Cássia Eller           4342 0.00921
##  8 Boyce Avenue           4301 0.00913
##  9 Lady Gaga              3960 0.00840
## 10 U2                     3906 0.00829
## # ℹ 6,316 more rows
spotify_full %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 6326
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
artist_name 1 1 2 91 0 6325 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
min_played 0 1 74.5 365.82 0 3 6 24 10808 ▇▁▁▁▁

vs. shuffle

spotify_full %>% 
  filter(shuffle == FALSE) %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## # A tibble: 3,658 × 3
##    artist_name      min_played       p
##    <chr>                 <dbl>   <dbl>
##  1 James Morrison         3138 0.0210 
##  2 <NA>                   2163 0.0145 
##  3 P!nk                   2107 0.0141 
##  4 John Mayer             2067 0.0139 
##  5 Marília Mendonça       2022 0.0136 
##  6 Cássia Eller           1777 0.0119 
##  7 Boyce Avenue           1643 0.0110 
##  8 Maria Gadú             1535 0.0103 
##  9 Elvis Presley          1510 0.0101 
## 10 Colbie Caillat         1482 0.00994
## # ℹ 3,648 more rows

vs. reason_start

  • “playbtn” - The user pressed Play on a playlist or other list of tracks
  • “clickrow” - The user pressed a specific track in a list of tracks.
spotify_full %>% 
  filter(reason_start %in% c("playbtn","clickrow")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 4,855 × 4
## # Groups:   track_name [4,530]
##    track_name                                       artist_name min_played     p
##    <chr>                                            <chr>            <dbl> <dbl>
##  1 <NA>                                             <NA>              1386     1
##  2 Please Don't Stop The Rain                       James Morr…        215     1
##  3 Girls Like You (feat. Cardi B) - Cardi B Version Maroon 5           212     1
##  4 Just A Fool                                      Christina …        204     1
##  5 Beautiful Trauma                                 P!nk               185     1
##  6 Sorry Not Sorry - Acoustic                       Demi Lovato        181     1
##  7 You Make It Real                                 James Morr…        127     1
##  8 Shallow - Radio Edit                             Lady Gaga          121     1
##  9 Say It All Over Again                            James Morr…        118     1
## 10 Viva                                             Zimbra             114     1
## # ℹ 4,845 more rows
  • “backbtn” - The user pressed Back and this was the previous track to play
spotify_full %>% 
  filter(reason_start %in% c("backbtn")) %>% 
  group_by(artist_name, track_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'artist_name'. You can override using the
## `.groups` argument.
## # A tibble: 2,083 × 4
## # Groups:   artist_name [1,009]
##    artist_name        track_name                                min_played     p
##    <chr>              <chr>                                          <dbl> <dbl>
##  1 Zimbra             Viva                                             192 0.980
##  2 James Morrison     Please Don't Stop The Rain                       153 0.384
##  3 Christina Aguilera Just A Fool                                      149 0.882
##  4 Luciana Mello      Tchau                                            146 1    
##  5 Lady Gaga          Shallow - Radio Edit                             138 0.817
##  6 P!nk               Beautiful Trauma                                 129 0.502
##  7 Zac Efron          Rewrite The Stars                                129 1    
##  8 Gustavo Trebien    Apenas Mais Uma De Amor - The Voice Bras…        114 0.983
##  9 Redbone            Come and Get Your Love - Single Version          109 1    
## 10 Maroon 5           Girls Like You (feat. Cardi B) - Cardi B…        105 0.766
## # ℹ 2,073 more rows

vs. reason_end

  • “trackdone” - The track played to its end
spotify_full %>% 
  filter(reason_end %in% c("trackdone")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 16,030 × 4
## # Groups:   track_name [14,296]
##    track_name                               artist_name        min_played     p
##    <chr>                                    <chr>                   <dbl> <dbl>
##  1 Please Don't Stop The Rain               James Morrison           1457 1    
##  2 Viva                                     Zimbra                   1253 1    
##  3 Drops of Jupiter (Tell Me)               Train                    1158 1    
##  4 You Give Me Something - (Live fromTokyo) James Morrison           1147 1    
##  5 Almost Is Never Enough                   Ariana Grande            1145 1    
##  6 Beautiful Trauma                         P!nk                     1081 1    
##  7 Mr. Brightside                           The Killers              1049 1    
##  8 Just A Fool                              Christina Aguilera       1040 1    
##  9 You Make It Real                         James Morrison            986 1    
## 10 A Million Dreams                         P!nk                      981 0.754
## # ℹ 16,020 more rows
  • “fwdbtn” - The user pressed Next/Forward
  • “endplay” - The user started playing something else (by clicking a track or Play/Shuffle buttonor similar)
spotify_full %>% 
  filter(reason_end %in% c("fwdbtn", "endplay")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 13,844 × 4
## # Groups:   track_name [12,570]
##    track_name                                       artist_name min_played     p
##    <chr>                                            <chr>            <dbl> <dbl>
##  1  <NA>                                            <NA>               419 1    
##  2 "Hey Jude - Remastered 2015"                     The Beatles         96 1    
##  3 "Don't Go Away"                                  Oasis               88 0.978
##  4 "Paciência"                                      Lenine              87 1    
##  5 "Sinônimos (Ao Vivo)"                            Zé Ramalho          87 1    
##  6 "Sweet Child O' Mine"                            Taken By T…         87 0.547
##  7 "Can You Feel The Love Tonight/Nants' Ingonyama… Jason Deru…         85 1    
##  8 "Fácil"                                          Jota Quest          85 1    
##  9 "Quando Fui Chuva - Ao Vivo"                     Maria Gadú          82 1    
## 10 "Never Gonna Let You Down"                       Colbie Cai…         81 1    
## # ℹ 13,834 more rows
  • “backbtn” - The user pressed Back
spotify_full %>% 
  filter(reason_end %in% c("backbtn")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 2,281 × 4
## # Groups:   track_name [2,186]
##    track_name                             artist_name     min_played     p
##    <chr>                                  <chr>                <dbl> <dbl>
##  1 Beautiful Trauma                       P!nk                    18     1
##  2 Pétala                                 Djavan                  17     1
##  3 Please Don't Stop The Rain             James Morrison          16     1
##  4 Mean                                   P!nk                    13     1
##  5 Dois Sorrisos                          Leoni                   11     1
##  6 Dos Oruguitas                          Sebastian Yatra         11     1
##  7 One Vision - Remastered 2011           Queen                   11     1
##  8 Preto E Branco - The Voice Brasil 2016 Dan Costa               11     1
##  9 I'm Not In Love                        10cc                    10     1
## 10 Rewrite The Stars                      Zac Efron               10     1
## # ℹ 2,271 more rows

Qtd de minutos tocados por trackName

spotify_full %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  DT::datatable()
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

Anlys

tempo por semana

(spotify_full_week <- spotify_full %>% 
  mutate(ts_wday = lubridate::wday(ts, 
                                   label = TRUE,
                                   week_start = 1)) %>% 
  mutate(ts_week = lubridate::week(ts)) %>% 
  group_by(ts_wday, ts_week) %>% 
  summarise(min_played = sum(min_played)) %>%
  ungroup() %>% 
  select(-ts_week))
## `summarise()` has grouped output by 'ts_wday'. You can override using the
## `.groups` argument.
## # A tibble: 370 × 2
##    ts_wday min_played
##    <ord>        <dbl>
##  1 seg           1634
##  2 seg           2237
##  3 seg           1399
##  4 seg           1640
##  5 seg           2112
##  6 seg           2312
##  7 seg           2005
##  8 seg           1508
##  9 seg           1346
## 10 seg           1715
## # ℹ 360 more rows
spotify_full_week %>% 
  group_by(ts_wday) %>%
  skimr::skim() 
Data summary
Name Piped data
Number of rows 370
Number of columns 2
_______________________
Column type frequency:
numeric 1
________________________
Group variables ts_wday

Variable type: numeric

skim_variable ts_wday n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
min_played seg 0 1 1565.90 428.49 607 1352 1510.5 1936.75 2320 ▂▂▇▃▅
min_played ter 0 1 1658.36 547.00 184 1443 1693.0 1973.00 2719 ▂▂▇▇▃
min_played qua 0 1 1544.02 513.05 67 1299 1554.0 1769.00 2790 ▁▂▇▅▁
min_played qui 0 1 1627.79 522.18 57 1312 1584.0 2025.00 2588 ▁▂▇▆▅
min_played sex 0 1 1414.11 522.60 267 1062 1287.0 1787.00 2758 ▁▇▅▃▁
min_played sáb 0 1 572.60 328.21 79 352 527.0 700.00 1913 ▆▇▃▁▁
min_played dom 0 1 539.43 269.12 101 330 522.0 687.00 1322 ▆▇▆▂▁
spotify_full_week %>% 
  ggplot(aes(min_played, ts_wday)) +
  #ggridges::geom_density_ridges() 
  geom_boxplot()

tempo por hora

(spotify_full_hour <- spotify_full %>% 
  #mutate(ts_day = lubridate::day(ts)) %>% 
  mutate(ts_wday = lubridate::wday(ts, label = TRUE, week_start = 1)) %>% 
  mutate(ts_hour = lubridate::hour(ts)) %>% 
  group_by(ts_year, ts_date, ts_wday, ts_hour) %>% 
  summarise(min_played = sum(min_played)) %>%
  ungroup())
## `summarise()` has grouped output by 'ts_year', 'ts_date', 'ts_wday'. You can
## override using the `.groups` argument.
## # A tibble: 13,944 × 5
##    ts_year ts_date    ts_wday ts_hour min_played
##      <dbl> <date>     <ord>     <int>      <dbl>
##  1    2014 2014-06-09 seg           3         11
##  2    2014 2014-06-09 seg           4         36
##  3    2014 2014-06-09 seg          14         19
##  4    2014 2014-06-14 sáb          20         42
##  5    2014 2014-06-14 sáb          21         39
##  6    2014 2014-06-14 sáb          23          2
##  7    2014 2014-06-15 dom           0          3
##  8    2014 2014-06-15 dom          17         34
##  9    2014 2014-06-15 dom          18          3
## 10    2014 2014-06-16 seg          13         19
## # ℹ 13,934 more rows
spotify_full_hour %>% 
  select(-ts_date, -ts_year) %>% 
  group_by(ts_hour) %>%
  skimr::skim() 
Data summary
Name Piped data
Number of rows 13944
Number of columns 3
_______________________
Column type frequency:
factor 1
numeric 1
________________________
Group variables ts_hour

Variable type: factor

skim_variable ts_hour n_missing complete_rate ordered n_unique top_counts
ts_wday 0 0 1 TRUE 7 ter: 111, qua: 104, qui: 93, sex: 87
ts_wday 1 0 1 TRUE 7 ter: 91, qua: 77, qui: 75, sex: 70
ts_wday 2 0 1 TRUE 7 qua: 51, qui: 49, sáb: 48, ter: 47
ts_wday 3 0 1 TRUE 7 qua: 36, sex: 32, ter: 30, sáb: 29
ts_wday 4 0 1 TRUE 7 sáb: 23, qua: 22, qui: 18, sex: 17
ts_wday 5 0 1 TRUE 7 qua: 16, qui: 15, sex: 15, sáb: 13
ts_wday 6 0 1 TRUE 6 sex: 15, sáb: 12, qua: 9, qui: 9
ts_wday 7 0 1 TRUE 7 sex: 13, sáb: 11, qua: 10, qui: 9
ts_wday 8 0 1 TRUE 7 sex: 19, qui: 16, seg: 15, ter: 14
ts_wday 9 0 1 TRUE 7 seg: 43, qui: 43, sex: 43, ter: 41
ts_wday 10 0 1 TRUE 7 qua: 95, sex: 93, qui: 91, ter: 88
ts_wday 11 0 1 TRUE 7 ter: 168, qui: 152, sex: 146, qua: 139
ts_wday 12 0 1 TRUE 7 ter: 182, qui: 181, sex: 175, qua: 156
ts_wday 13 0 1 TRUE 7 ter: 169, qua: 160, qui: 155, sex: 148
ts_wday 14 0 1 TRUE 7 ter: 169, seg: 158, qui: 152, qua: 143
ts_wday 15 0 1 TRUE 7 seg: 154, ter: 151, qui: 150, qua: 138
ts_wday 16 0 1 TRUE 7 seg: 138, ter: 137, qua: 132, qui: 126
ts_wday 17 0 1 TRUE 7 qui: 162, qua: 161, seg: 155, ter: 155
ts_wday 18 0 1 TRUE 7 qui: 161, seg: 159, qua: 158, ter: 157
ts_wday 19 0 1 TRUE 7 ter: 167, seg: 164, qui: 159, qua: 151
ts_wday 20 0 1 TRUE 7 ter: 176, seg: 164, qua: 159, qui: 155
ts_wday 21 0 1 TRUE 7 ter: 188, seg: 173, qui: 166, qua: 162
ts_wday 22 0 1 TRUE 7 seg: 163, ter: 152, qui: 138, qua: 129
ts_wday 23 0 1 TRUE 7 seg: 121, qua: 121, qui: 120, ter: 112

Variable type: numeric

skim_variable ts_hour n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
min_played 0 0 1 30.52 21.92 0 11.00 28.0 51.00 125 ▇▅▅▁▁
min_played 1 0 1 26.89 22.34 0 6.00 23.0 43.00 164 ▇▅▁▁▁
min_played 2 0 1 25.40 22.36 0 5.00 19.0 45.50 124 ▇▃▃▁▁
min_played 3 0 1 22.49 25.78 0 3.00 12.0 42.00 193 ▇▃▁▁▁
min_played 4 0 1 26.46 23.45 0 3.00 19.0 54.75 66 ▇▃▂▁▆
min_played 5 0 1 32.86 30.81 0 7.00 24.0 59.00 191 ▇▅▁▁▁
min_played 6 0 1 35.77 23.08 0 14.00 39.0 60.00 64 ▅▃▂▃▇
min_played 7 0 1 29.04 25.03 0 4.25 18.5 59.75 65 ▇▂▂▂▆
min_played 8 0 1 24.43 21.74 0 3.00 19.0 40.00 65 ▇▃▃▂▃
min_played 9 0 1 23.52 26.87 0 3.00 17.0 37.00 264 ▇▁▁▁▁
min_played 10 0 1 27.75 21.07 0 7.00 26.0 47.50 137 ▇▅▂▁▁
min_played 11 0 1 30.84 21.35 0 13.00 30.0 48.00 177 ▇▆▁▁▁
min_played 12 0 1 31.99 20.87 0 13.00 32.0 49.00 189 ▇▆▁▁▁
min_played 13 0 1 38.41 21.74 0 20.00 42.0 58.00 151 ▆▇▂▁▁
min_played 14 0 1 38.21 26.47 0 17.00 42.0 58.00 403 ▇▁▁▁▁
min_played 15 0 1 34.07 22.15 0 15.00 35.0 53.00 186 ▇▆▁▁▁
min_played 16 0 1 30.98 22.23 0 11.00 29.0 51.00 152 ▇▆▂▁▁
min_played 17 0 1 35.32 20.78 0 18.00 37.0 56.00 82 ▆▅▅▇▁
min_played 18 0 1 39.51 22.42 0 21.00 45.0 59.00 239 ▇▇▁▁▁
min_played 19 0 1 38.22 21.79 0 19.00 42.0 59.00 98 ▇▆▇▇▁
min_played 20 0 1 37.35 21.67 0 17.00 42.0 58.00 99 ▆▅▇▅▁
min_played 21 0 1 37.58 30.73 0 17.00 40.0 55.00 454 ▇▁▁▁▁
min_played 22 0 1 33.10 31.44 0 10.00 30.0 56.00 586 ▇▁▁▁▁
min_played 23 0 1 32.05 23.06 0 12.00 30.0 54.00 174 ▇▆▁▁▁
spotify_full_hour %>% 
  filter(min_played < 150) %>% 
  ggplot(aes(min_played, as_factor(ts_hour))) +
  #ggridges::geom_density_ridges() 
  geom_boxplot() 

spotify_full_hour %>% 
  mutate(hour_played = min_played/60) %>% 
  ggplot(aes(hour_played, as_factor(ts_hour))) +
  ggridges::geom_density_ridges() 
## Picking joint bandwidth of 0.109

Backlog