#remotes::install_github("johncassil/stringr.plus")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Data

Primeiro você precisa solicitar as suas informações neste site download-your-data , para mais informações sobre as bases disponibilizadas e as respectivas informações, understanding-my-data. E como não são dadas muitas descrições, achei este link que me ajudou a entender melhor as variáveis e seus respectivos níveis (particularmente o reason_start e reason_end: hack_dados_spotifhack_dados_spotify .

historical data

Fiz o download dos dados, salvei em uma pasta, e aí :

#list all JSON files from directory
files_names <- list.files("my_spotify_historical_data/", pattern = "*.json", full.names = T)
  #tentei usar o pacote `fs`, não conhecia, algo não deu certo, e segui sem ele:
  #files_names <- fs::dir_ls(path = "my_spotify_historical_data/", glob = "*.json")


#read them all  
data_list <- files_names %>% map(~jsonlite::fromJSON(.))

#concatenate by row
spotify_full_raw <- data_list %>% bind_rows() 

  #pareceu uma alternativa interessante, mas fica para a Nath do futuro fuçar
  # tibble(file_names = files_names) %>% 
  #   mutate(data = map(files_names, read_csv)) %>% 
  #   unnest()

spotify_full_raw %>% glimpse()

## Rows: 177,255
## Columns: 21
## $ ts                                <chr> "2014-06-09T03:49:48Z", "2014-06-09T…
## $ username                          <chr> "12143382095", "12143382095", "12143…
## $ platform                          <chr> "iOS 7.1.1 (iPad2,5)", "iOS 7.1.1 (i…
## $ ms_played                         <int> 211086, 194079, 231546, 201506, 1885…
## $ conn_country                      <chr> "BR", "BR", "BR", "BR", "BR", "BR", …
## $ ip_addr_decrypted                 <chr> "177.148.215.115", "177.148.215.115"…
## $ user_agent_decrypted              <chr> "unknown", "unknown", "unknown", "un…
## $ master_metadata_track_name        <chr> "Sober", "Money On My Mind", "Sweet …
## $ master_metadata_album_artist_name <chr> "Elli Ingram", "Sam Smith", "The Tem…
## $ master_metadata_album_album_name  <chr> "Sober EP", "Money On My Mind", "Con…
## $ spotify_track_uri                 <chr> "spotify:track:4ymQoZNLQpis51EmcgAoN…
## $ episode_name                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ spotify_episode_uri               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start                      <chr> "", "trackdone", "trackdone", "track…
## $ reason_end                        <chr> "trackdone", "trackdone", "trackdone…
## $ shuffle                           <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ skipped                           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline                           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline_timestamp                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ incognito_mode                    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…

Infos Description

Date and time of when the stream ended in UTC format (Coordinated Universal Time zone).
Your Spotify username.
Platform used when streaming the track (e.g. Android OS, Google Chromecast).
For how many milliseconds the track was played.
Country code of the country where the stream was played.
IP address used when streaming the track.
User agent used when streaming the track (e.g. a browser, like Mozilla Firefox, or Safari).
Name of the track.
Name of the artist, band or podcast.
Name of the album of the track.
A Spotify Track URI, that is identifying the unique music track.
Name of the episode of the podcast.
Name of the show of the podcast.
A Spotify Episode URI, that is identifying the unique podcast episode.
Reason why the track started (e.g. previous track finished or you picked it from the playlist).
Reason why the track ended (e.g. the track finished playing or you hit the next button).
Whether shuffle mode was used when playing the track.
Information whether the user skipped to the next song.
Information whether the track was played in offline mode.
Timestamp of when offline mode was used, if it was used.
Information whether the track was played during a private session.

Dataprep

spotify_full <- spotify_full_raw %>% #glimpse()
  select(-username, -platform, -ip_addr_decrypted, 
         -user_agent_decrypted, -spotify_track_uri,  -spotify_episode_uri) %>% 
  mutate(ts = lubridate::as_datetime(ts, format="%Y-%m-%dT%H:%M:%SZ")) %>% 
  mutate(ts_date = lubridate::date(ts)) %>% 
  mutate(ts_year = lubridate::year(ts)) %>% 
  mutate(min_played = round(ms_played*1.7*10^(-5))) %>% 
  relocate(min_played, .after = ms_played) %>% 
  rename(track_name = master_metadata_track_name) %>% 
  rename(artist_name = master_metadata_album_artist_name) %>% 
  rename(album_name = master_metadata_album_album_name) %>% 
  tibble() %>% 
  glimpse()

## Rows: 177,255
## Columns: 18
## $ ts                <dttm> 2014-06-09 03:49:48, 2014-06-09 03:53:02, 2014-06-0…
## $ ms_played         <int> 211086, 194079, 231546, 201506, 188586, 257386, 2074…
## $ min_played        <dbl> 4, 3, 4, 3, 3, 4, 4, 4, 3, 3, 4, 4, 4, 0, 1, 1, 0, 0…
## $ conn_country      <chr> "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR"…
## $ track_name        <chr> "Sober", "Money On My Mind", "Sweet Disposition", "H…
## $ artist_name       <chr> "Elli Ingram", "Sam Smith", "The Temper Trap", "Hot …
## $ album_name        <chr> "Sober EP", "Money On My Mind", "Conditions", "Whate…
## $ episode_name      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start      <chr> "", "trackdone", "trackdone", "trackdone", "trackdon…
## $ reason_end        <chr> "trackdone", "trackdone", "trackdone", "trackdone", …
## $ shuffle           <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
## $ skipped           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline_timestamp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ incognito_mode    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ ts_date           <date> 2014-06-09, 2014-06-09, 2014-06-09, 2014-06-09, 201…
## $ ts_year           <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…

Descriptions

ts_year

spotify_full %>%janitor::tabyl(ts_year)

##  ts_year     n     percent
##     2014   210 0.001184734
##     2016 16799 0.094773067
##     2017 39737 0.224179854
##     2018 47648 0.268810471
##     2019 32678 0.184355871
##     2020 15130 0.085357254
##     2021  9767 0.055101408
##     2022  9921 0.055970212
##     2023  5365 0.030267129

parece que tem um “pulo” do ano de 2015, mas não sei dizer se isso corresponde com a realidade. Não lembro!

spotify_full %>%
  filter(between(ts_year, 2014, 2016)) %>%  
  janitor::tabyl(ts_date) %>% 
  head(n=15)

##     ts_date  n      percent
##  2014-06-09 28 0.0016461873
##  2014-06-14 27 0.0015873949
##  2014-06-15 14 0.0008230937
##  2014-06-16 11 0.0006467164
##  2014-06-19 15 0.0008818861
##  2014-06-20 27 0.0015873949
##  2014-06-22 14 0.0008230937
##  2014-06-23  5 0.0002939620
##  2014-07-02  9 0.0005291316
##  2014-07-07 10 0.0005879240
##  2014-07-08  4 0.0002351696
##  2014-08-09  8 0.0004703392
##  2014-09-12  3 0.0001763772
##  2014-10-25 35 0.0020577341
##  2016-04-19  1 0.0000587924

conn_country

spotify_full %>% janitor::tabyl(conn_country) %>% arrange(-n)

##  conn_country      n      percent
##            BR 169911 0.9585681645
##            PT   4475 0.0252461144
##            ZZ   1557 0.0087839553
##            US    553 0.0031197992
##            DE    299 0.0016868354
##            IE    218 0.0012298666
##            AT    207 0.0011678091
##            NL     35 0.0001974556

tem inconsistências: GPS não é 100% confiável

só fui uma vez em US, 2018, tem uma linha de 2019!

spotify_full %>% filter(conn_country == "US") %>% 
  janitor::tabyl(ts_year) %>% arrange(ts_year)

##  ts_year   n    percent
##     2018 540 0.97649186
##     2019  13 0.02350814

ZZ é o código para país não identificado, e aparecem alguns assim ao longo dos anos

spotify_full %>% 
  filter(conn_country == "ZZ") %>% 
  janitor::tabyl(ts_year) %>% 
  arrange(ts_year)

##  ts_year   n    percent
##     2017 710 0.45600514
##     2018 717 0.46050096
##     2019 117 0.07514451
##     2020  13 0.00834939

nunca fui na Irlanda (IE)!

spotify_full %>% 
  filter(conn_country == "IE") %>% 
  janitor::tabyl(ts_date) %>% 
  arrange(ts_date)

##     ts_date   n     percent
##  2020-02-10 100 0.458715596
##  2020-04-05   3 0.013761468
##  2020-08-04   1 0.004587156
##  2020-08-11   3 0.013761468
##  2020-08-17   8 0.036697248
##  2022-10-19   5 0.022935780
##  2022-10-20  70 0.321100917
##  2022-10-22  28 0.128440367

mas note que 95% dos dados são “BR”! Então tá tudo certo ;)

episode

spotify_full %>% 
  group_by(episode_show_name) %>%
  count() %>% 
  arrange(-n)

## # A tibble: 34 × 2
## # Groups:   episode_show_name [34]
##    episode_show_name                                                       n
##    <chr>                                                               <int>
##  1 <NA>                                                               177065
##  2 GunCast | Criatividade e Inovação                                      56
##  3 Mario Sergio Cortella - No Meio do Caminho - Mario Sergio Cortella     37
##  4 Mamilos                                                                15
##  5 Data Hackers                                                           12
##  6 Elas Programam                                                          6
##  7 Pizza de Dados                                                          6
##  8 Spotify                                                                 6
##  9 Inédita Pamonha                                                         5
## 10 AMOR E SEXO, contos eróticos narrados                                   4
## # ℹ 24 more rows

claramente eu não sou uma pessoa de podcast…

spotify_full %>% 
  mutate(episode_NA = ifelse(is.na(episode_show_name), "nao-podcast", "podcast")) %>% 
  #group_by(episode_NA) %>%
  janitor::tabyl(episode_NA)

##   episode_NA      n     percent
##  nao-podcast 177065 0.998928098
##      podcast    190 0.001071902

reason_start

Considerando as descrições abaixo, o “clickrow” parece ser interessante, pois indica o interesse na faixa em específico, o “playbtn” também, mas aqui há uma mudança de lista, de estilo. Mas entendo que as duas têm uma natureza similar. O “backbtn” parece bem interessante também, pois indica o comportamento de “repeat”.

“trackdone” - The previous track played to its end and this was the next track to play
“fwdbtn” - The user pressed Next/Forward and this was the next track to play
“clickrow” - The user pressed a specific track in a list of tracks.
“backbtn” - The user pressed Back and this was the previous track to play
“playbtn” - The user pressed Play on a playlist or other list of tracks
“unknown” - The reason why the user (or the app) started playing this track is unknown
“trackerror” - The playback of the previous track ran into some form of error, and this was the next track to play
“remote” - The track was startedon another device and then transferred to this device.

spotify_full %>% 
  janitor::tabyl(reason_start) %>% 
  janitor::adorn_pct_formatting() %>% 
  arrange(-n)

##  reason_start      n percent
##     trackdone 111533   62.9%
##        fwdbtn  45318   25.6%
##      clickrow   8781    5.0%
##       backbtn   4667    2.6%
##       appload   2471    1.4%
##       playbtn   2432    1.4%
##       unknown   1171    0.7%
##    trackerror    613    0.3%
##        remote    227    0.1%
##                   42    0.0%

reason_end

Pelas descrições que seguem, o “trackdone” talvez seja um filtro importante para aplicar em toda a base a depender do objetivo. Já as músicas “fwdbtn” indicam uma falta de interesse direta, e o “endplay” um desinteresse indireto talvez? Na contramão, “backbtn”, assim como o `reason_start`, é sobre o comportamento de “repeat”.

“trackdone” - The track played to its end
“fwdbtn” - The user pressed Next/Forward
“endplay” - The user started playing something else (by clicking a track or Play/Shuffle buttonor similar)
“backbtn” - The user pressed Back
“logout” - The app was shut down or theuser logged out. “remote” - The track playback was moved to another device.
“unexpected-exit” - The app was shut down eitherby the user, or by the operating system, or it crashed.
“unexpected-exit-while-paused” - Theplayback was paused, and at some point after that, the app was shut down either by the user,or by the operating system, or it crashed.
“trackerror” - The playback of theprevious track ran into some form of error, and this was the next track to play
“remote” - The track playback was moved to another device
“clickrow” - The userpressed a specific track in a list of tracks.

spotify_full %>% 
  janitor::tabyl(reason_end) %>% 
  janitor::adorn_pct_formatting() %>% 
  arrange(-n)

##                    reason_end      n percent
##                     trackdone 112251   63.3%
##                        fwdbtn  45618   25.7%
##                       endplay   7840    4.4%
##                       backbtn   4704    2.7%
##                        logout   3881    2.2%
##                       unknown   1351    0.8%
##  unexpected-exit-while-paused   1181    0.7%
##               unexpected-exit    159    0.1%
##                    trackerror    135    0.1%
##                        remote    108    0.1%
##                                   26    0.0%
##                      clickrow      1    0.0%

skipped

info com problema de missing local

Estava super animada para avaliar essa info, mas parece que ela não foi devidamente registrada ao longo de todo o histórico :( Enviei um e-mail para o spotify para entender melhor sobre!

spotify_full %>% janitor::tabyl(skipped) %>% arrange(-n)

##  skipped      n    percent valid_percent
##       NA 168941 0.95309582            NA
##    FALSE   5396 0.03044202     0.6490257
##     TRUE   2918 0.01646216     0.3509743

spotify_full %>% janitor::tabyl(skipped, ts_year)

##  skipped 2014  2016  2017  2018  2019  2020 2021 2022 2023
##    FALSE  133     0     3     0     0     0    0 1624 3636
##     TRUE   77     0     0     0     0     0    0 1112 1729
##       NA    0 16799 39734 47648 32678 15130 9767 7185    0

spotify_full %>% 
  janitor::tabyl(reason_end, skipped) %>% 
  janitor::adorn_totals("col") %>% 
  arrange(-Total)

##                    reason_end FALSE TRUE    NA_  Total
##                     trackdone  5014    0 107237 112251
##                        fwdbtn     0 2154  43464  45618
##                       endplay     0  517   7323   7840
##                       backbtn     0  221   4483   4704
##                        logout   337    0   3544   3881
##                       unknown     0    0   1351   1351
##  unexpected-exit-while-paused    18    0   1163   1181
##               unexpected-exit    14    0    145    159
##                    trackerror     0    0    135    135
##                        remote    13    0     95    108
##                                   0   26      0     26
##                      clickrow     0    0      1      1

incognito_mode

Não sei bem o que é, mas não tem dados suficientes para que vala a pena se preocupar…

spotify_full %>% janitor::tabyl(incognito_mode) %>% arrange(-n)

##  incognito_mode      n      percent
##           FALSE 177251 9.999774e-01
##            TRUE      4 2.256636e-05

Artist vs. track vs. min_played

spotify_full %>%  
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  ungroup() %>% 
  slice_max(min_played, n = 100) %>% 
  DT::datatable()

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

album_name

spotify_full %>% 
  group_by(album_name, artist_name) %>% 
  count() %>% 
  arrange(-n) %>% 
  ungroup() %>% 
  slice_head(n = 100) %>% 
  DT::datatable()

artist_name

spotify_full %>% 
  group_by(artist_name) %>% 
  count() %>% 
  arrange(-n)

## # A tibble: 6,326 × 2
## # Groups:   artist_name [6,326]
##    artist_name          n
##    <chr>            <int>
##  1 P!nk              3265
##  2 James Morrison    2765
##  3 Queen             2617
##  4 Marília Mendonça  2551
##  5 John Mayer        1678
##  6 Cássia Eller      1586
##  7 Maroon 5          1500
##  8 Boyce Avenue      1491
##  9 TIAGO IORC        1377
## 10 Marisa Monte      1285
## # ℹ 6,316 more rows

spotify_full %>% 
  group_by(artist_name) %>% 
  count() %>% 
  ungroup() %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	6326
Number of columns	2
_______________________
Column type frequency:
character	1
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
artist_name	1	1	2	91	0	6325	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
n	0	1	28.02	122.42	1	1	3	10	3265	▇▁▁▁▁

track_name

spotify_full %>% 
  group_by(track_name, artist_name) %>% 
  count() %>% 
  arrange(-n)

## # A tibble: 22,268 × 3
## # Groups:   track_name, artist_name [22,268]
##    track_name                               artist_name        n
##    <chr>                                    <chr>          <int>
##  1 Please Don't Stop The Rain               James Morrison   431
##  2 You Give Me Something - (Live fromTokyo) James Morrison   356
##  3 Drops of Jupiter (Tell Me)               Train            343
##  4 Mr. Brightside                           The Killers      338
##  5 Viva                                     Zimbra           319
##  6 Classic                                  MKTO             314
##  7 Resposta                                 Skank            310
##  8 Never Gonna Let You Down                 Colbie Caillat   308
##  9 Sorry Not Sorry - Acoustic               Demi Lovato      307
## 10 Bubbly                                   Colbie Caillat   305
## # ℹ 22,258 more rows

Qtd de minutos tocados por artistName

spotify_full %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## # A tibble: 6,326 × 3
##    artist_name      min_played       p
##    <chr>                 <dbl>   <dbl>
##  1 P!nk                  10808 0.0229 
##  2 James Morrison         9990 0.0212 
##  3 Queen                  9216 0.0196 
##  4 Marília Mendonça       6451 0.0137 
##  5 Maroon 5               4947 0.0105 
##  6 John Mayer             4913 0.0104 
##  7 Cássia Eller           4342 0.00921
##  8 Boyce Avenue           4301 0.00913
##  9 Lady Gaga              3960 0.00840
## 10 U2                     3906 0.00829
## # ℹ 6,316 more rows

spotify_full %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	6326
Number of columns	2
_______________________
Column type frequency:
character	1
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
artist_name	1	1	2	91	0	6325	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
min_played	0	1	74.5	365.82	0	3	6	24	10808	▇▁▁▁▁

vs. shuffle

spotify_full %>% 
  filter(shuffle == FALSE) %>% 
  group_by(artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## # A tibble: 3,658 × 3
##    artist_name      min_played       p
##    <chr>                 <dbl>   <dbl>
##  1 James Morrison         3138 0.0210 
##  2 <NA>                   2163 0.0145 
##  3 P!nk                   2107 0.0141 
##  4 John Mayer             2067 0.0139 
##  5 Marília Mendonça       2022 0.0136 
##  6 Cássia Eller           1777 0.0119 
##  7 Boyce Avenue           1643 0.0110 
##  8 Maria Gadú             1535 0.0103 
##  9 Elvis Presley          1510 0.0101 
## 10 Colbie Caillat         1482 0.00994
## # ℹ 3,648 more rows

vs. reason_start

“playbtn” - The user pressed Play on a playlist or other list of tracks
“clickrow” - The user pressed a specific track in a list of tracks.

spotify_full %>% 
  filter(reason_start %in% c("playbtn","clickrow")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

## # A tibble: 4,855 × 4
## # Groups:   track_name [4,530]
##    track_name                                       artist_name min_played     p
##    <chr>                                            <chr>            <dbl> <dbl>
##  1 <NA>                                             <NA>              1386     1
##  2 Please Don't Stop The Rain                       James Morr…        215     1
##  3 Girls Like You (feat. Cardi B) - Cardi B Version Maroon 5           212     1
##  4 Just A Fool                                      Christina …        204     1
##  5 Beautiful Trauma                                 P!nk               185     1
##  6 Sorry Not Sorry - Acoustic                       Demi Lovato        181     1
##  7 You Make It Real                                 James Morr…        127     1
##  8 Shallow - Radio Edit                             Lady Gaga          121     1
##  9 Say It All Over Again                            James Morr…        118     1
## 10 Viva                                             Zimbra             114     1
## # ℹ 4,845 more rows

“backbtn” - The user pressed Back and this was the previous track to play

spotify_full %>% 
  filter(reason_start %in% c("backbtn")) %>% 
  group_by(artist_name, track_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## `summarise()` has grouped output by 'artist_name'. You can override using the
## `.groups` argument.

## # A tibble: 2,083 × 4
## # Groups:   artist_name [1,009]
##    artist_name        track_name                                min_played     p
##    <chr>              <chr>                                          <dbl> <dbl>
##  1 Zimbra             Viva                                             192 0.980
##  2 James Morrison     Please Don't Stop The Rain                       153 0.384
##  3 Christina Aguilera Just A Fool                                      149 0.882
##  4 Luciana Mello      Tchau                                            146 1    
##  5 Lady Gaga          Shallow - Radio Edit                             138 0.817
##  6 P!nk               Beautiful Trauma                                 129 0.502
##  7 Zac Efron          Rewrite The Stars                                129 1    
##  8 Gustavo Trebien    Apenas Mais Uma De Amor - The Voice Bras…        114 0.983
##  9 Redbone            Come and Get Your Love - Single Version          109 1    
## 10 Maroon 5           Girls Like You (feat. Cardi B) - Cardi B…        105 0.766
## # ℹ 2,073 more rows

vs. reason_end

“trackdone” - The track played to its end

spotify_full %>% 
  filter(reason_end %in% c("trackdone")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

## # A tibble: 16,030 × 4
## # Groups:   track_name [14,296]
##    track_name                               artist_name        min_played     p
##    <chr>                                    <chr>                   <dbl> <dbl>
##  1 Please Don't Stop The Rain               James Morrison           1457 1    
##  2 Viva                                     Zimbra                   1253 1    
##  3 Drops of Jupiter (Tell Me)               Train                    1158 1    
##  4 You Give Me Something - (Live fromTokyo) James Morrison           1147 1    
##  5 Almost Is Never Enough                   Ariana Grande            1145 1    
##  6 Beautiful Trauma                         P!nk                     1081 1    
##  7 Mr. Brightside                           The Killers              1049 1    
##  8 Just A Fool                              Christina Aguilera       1040 1    
##  9 You Make It Real                         James Morrison            986 1    
## 10 A Million Dreams                         P!nk                      981 0.754
## # ℹ 16,020 more rows

“fwdbtn” - The user pressed Next/Forward
“endplay” - The user started playing something else (by clicking a track or Play/Shuffle buttonor similar)

spotify_full %>% 
  filter(reason_end %in% c("fwdbtn", "endplay")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

## # A tibble: 13,844 × 4
## # Groups:   track_name [12,570]
##    track_name                                       artist_name min_played     p
##    <chr>                                            <chr>            <dbl> <dbl>
##  1  <NA>                                            <NA>               419 1    
##  2 "Hey Jude - Remastered 2015"                     The Beatles         96 1    
##  3 "Don't Go Away"                                  Oasis               88 0.978
##  4 "Paciência"                                      Lenine              87 1    
##  5 "Sinônimos (Ao Vivo)"                            Zé Ramalho          87 1    
##  6 "Sweet Child O' Mine"                            Taken By T…         87 0.547
##  7 "Can You Feel The Love Tonight/Nants' Ingonyama… Jason Deru…         85 1    
##  8 "Fácil"                                          Jota Quest          85 1    
##  9 "Quando Fui Chuva - Ao Vivo"                     Maria Gadú          82 1    
## 10 "Never Gonna Let You Down"                       Colbie Cai…         81 1    
## # ℹ 13,834 more rows

“backbtn” - The user pressed Back

spotify_full %>% 
  filter(reason_end %in% c("backbtn")) %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  mutate(p= min_played/sum(min_played))

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

## # A tibble: 2,281 × 4
## # Groups:   track_name [2,186]
##    track_name                             artist_name     min_played     p
##    <chr>                                  <chr>                <dbl> <dbl>
##  1 Beautiful Trauma                       P!nk                    18     1
##  2 Pétala                                 Djavan                  17     1
##  3 Please Don't Stop The Rain             James Morrison          16     1
##  4 Mean                                   P!nk                    13     1
##  5 Dois Sorrisos                          Leoni                   11     1
##  6 Dos Oruguitas                          Sebastian Yatra         11     1
##  7 One Vision - Remastered 2011           Queen                   11     1
##  8 Preto E Branco - The Voice Brasil 2016 Dan Costa               11     1
##  9 I'm Not In Love                        10cc                    10     1
## 10 Rewrite The Stars                      Zac Efron               10     1
## # ℹ 2,271 more rows

Qtd de minutos tocados por trackName

spotify_full %>% 
  group_by(track_name, artist_name) %>% 
  summarise(min_played = sum(min_played)) %>% 
  arrange(-min_played) %>% 
  DT::datatable()

## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.

Anlys

tempo por semana

(spotify_full_week <- spotify_full %>% 
  mutate(ts_wday = lubridate::wday(ts, 
                                   label = TRUE,
                                   week_start = 1)) %>% 
  mutate(ts_week = lubridate::week(ts)) %>% 
  group_by(ts_wday, ts_week) %>% 
  summarise(min_played = sum(min_played)) %>%
  ungroup() %>% 
  select(-ts_week))

## `summarise()` has grouped output by 'ts_wday'. You can override using the
## `.groups` argument.

## # A tibble: 370 × 2
##    ts_wday min_played
##    <ord>        <dbl>
##  1 seg           1634
##  2 seg           2237
##  3 seg           1399
##  4 seg           1640
##  5 seg           2112
##  6 seg           2312
##  7 seg           2005
##  8 seg           1508
##  9 seg           1346
## 10 seg           1715
## # ℹ 360 more rows

spotify_full_week %>% 
  group_by(ts_wday) %>%
  skimr::skim()

Data summary
Name	Piped data
Number of rows	370
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	ts_wday

Variable type: numeric

skim_variable	ts_wday	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
min_played	seg	1	1565.90	428.49	607	1352	1510.5	1936.75	2320	▂▂▇▃▅
min_played	ter	1	1658.36	547.00	184	1443	1693.0	1973.00	2719	▂▂▇▇▃
min_played	qua	1	1544.02	513.05	67	1299	1554.0	1769.00	2790	▁▂▇▅▁
min_played	qui	1	1627.79	522.18	57	1312	1584.0	2025.00	2588	▁▂▇▆▅
min_played	sex	1	1414.11	522.60	267	1062	1287.0	1787.00	2758	▁▇▅▃▁
min_played	sáb	1	572.60	328.21	79	352	527.0	700.00	1913	▆▇▃▁▁
min_played	dom	1	539.43	269.12	101	330	522.0	687.00	1322	▆▇▆▂▁

spotify_full_week %>% 
  ggplot(aes(min_played, ts_wday)) +
  #ggridges::geom_density_ridges() 
  geom_boxplot()

tempo por hora

(spotify_full_hour <- spotify_full %>% 
  #mutate(ts_day = lubridate::day(ts)) %>% 
  mutate(ts_wday = lubridate::wday(ts, label = TRUE, week_start = 1)) %>% 
  mutate(ts_hour = lubridate::hour(ts)) %>% 
  group_by(ts_year, ts_date, ts_wday, ts_hour) %>% 
  summarise(min_played = sum(min_played)) %>%
  ungroup())

## `summarise()` has grouped output by 'ts_year', 'ts_date', 'ts_wday'. You can
## override using the `.groups` argument.

## # A tibble: 13,944 × 5
##    ts_year ts_date    ts_wday ts_hour min_played
##      <dbl> <date>     <ord>     <int>      <dbl>
##  1    2014 2014-06-09 seg           3         11
##  2    2014 2014-06-09 seg           4         36
##  3    2014 2014-06-09 seg          14         19
##  4    2014 2014-06-14 sáb          20         42
##  5    2014 2014-06-14 sáb          21         39
##  6    2014 2014-06-14 sáb          23          2
##  7    2014 2014-06-15 dom           0          3
##  8    2014 2014-06-15 dom          17         34
##  9    2014 2014-06-15 dom          18          3
## 10    2014 2014-06-16 seg          13         19
## # ℹ 13,934 more rows

spotify_full_hour %>% 
  select(-ts_date, -ts_year) %>% 
  group_by(ts_hour) %>%
  skimr::skim()

Data summary
Name	Piped data
Number of rows	13944
Number of columns	3
_______________________
Column type frequency:
factor	1
numeric	1
________________________
Group variables	ts_hour

Variable type: factor

skim_variable	ts_hour	complete_rate	ordered	n_unique	top_counts
ts_wday	0	1	TRUE	7	ter: 111, qua: 104, qui: 93, sex: 87
ts_wday	1	1	TRUE	7	ter: 91, qua: 77, qui: 75, sex: 70
ts_wday	2	1	TRUE	7	qua: 51, qui: 49, sáb: 48, ter: 47
ts_wday	3	1	TRUE	7	qua: 36, sex: 32, ter: 30, sáb: 29
ts_wday	4	1	TRUE	7	sáb: 23, qua: 22, qui: 18, sex: 17
ts_wday	5	1	TRUE	7	qua: 16, qui: 15, sex: 15, sáb: 13
ts_wday	6	1	TRUE	6	sex: 15, sáb: 12, qua: 9, qui: 9
ts_wday	7	1	TRUE	7	sex: 13, sáb: 11, qua: 10, qui: 9
ts_wday	8	1	TRUE	7	sex: 19, qui: 16, seg: 15, ter: 14
ts_wday	9	1	TRUE	7	seg: 43, qui: 43, sex: 43, ter: 41
ts_wday	10	1	TRUE	7	qua: 95, sex: 93, qui: 91, ter: 88
ts_wday	11	1	TRUE	7	ter: 168, qui: 152, sex: 146, qua: 139
ts_wday	12	1	TRUE	7	ter: 182, qui: 181, sex: 175, qua: 156
ts_wday	13	1	TRUE	7	ter: 169, qua: 160, qui: 155, sex: 148
ts_wday	14	1	TRUE	7	ter: 169, seg: 158, qui: 152, qua: 143
ts_wday	15	1	TRUE	7	seg: 154, ter: 151, qui: 150, qua: 138
ts_wday	16	1	TRUE	7	seg: 138, ter: 137, qua: 132, qui: 126
ts_wday	17	1	TRUE	7	qui: 162, qua: 161, seg: 155, ter: 155
ts_wday	18	1	TRUE	7	qui: 161, seg: 159, qua: 158, ter: 157
ts_wday	19	1	TRUE	7	ter: 167, seg: 164, qui: 159, qua: 151
ts_wday	20	1	TRUE	7	ter: 176, seg: 164, qua: 159, qui: 155
ts_wday	21	1	TRUE	7	ter: 188, seg: 173, qui: 166, qua: 162
ts_wday	22	1	TRUE	7	seg: 163, ter: 152, qui: 138, qua: 129
ts_wday	23	1	TRUE	7	seg: 121, qua: 121, qui: 120, ter: 112

Variable type: numeric

skim_variable	ts_hour	complete_rate	mean	sd	p25	p50	p75	p100	hist
min_played	0	1	30.52	21.92	11.00	28.0	51.00	125	▇▅▅▁▁
min_played	1	1	26.89	22.34	6.00	23.0	43.00	164	▇▅▁▁▁
min_played	2	1	25.40	22.36	5.00	19.0	45.50	124	▇▃▃▁▁
min_played	3	1	22.49	25.78	3.00	12.0	42.00	193	▇▃▁▁▁
min_played	4	1	26.46	23.45	3.00	19.0	54.75	66	▇▃▂▁▆
min_played	5	1	32.86	30.81	7.00	24.0	59.00	191	▇▅▁▁▁
min_played	6	1	35.77	23.08	14.00	39.0	60.00	64	▅▃▂▃▇
min_played	7	1	29.04	25.03	4.25	18.5	59.75	65	▇▂▂▂▆
min_played	8	1	24.43	21.74	3.00	19.0	40.00	65	▇▃▃▂▃
min_played	9	1	23.52	26.87	3.00	17.0	37.00	264	▇▁▁▁▁
min_played	10	1	27.75	21.07	7.00	26.0	47.50	137	▇▅▂▁▁
min_played	11	1	30.84	21.35	13.00	30.0	48.00	177	▇▆▁▁▁
min_played	12	1	31.99	20.87	13.00	32.0	49.00	189	▇▆▁▁▁
min_played	13	1	38.41	21.74	20.00	42.0	58.00	151	▆▇▂▁▁
min_played	14	1	38.21	26.47	17.00	42.0	58.00	403	▇▁▁▁▁
min_played	15	1	34.07	22.15	15.00	35.0	53.00	186	▇▆▁▁▁
min_played	16	1	30.98	22.23	11.00	29.0	51.00	152	▇▆▂▁▁
min_played	17	1	35.32	20.78	18.00	37.0	56.00	82	▆▅▅▇▁
min_played	18	1	39.51	22.42	21.00	45.0	59.00	239	▇▇▁▁▁
min_played	19	1	38.22	21.79	19.00	42.0	59.00	98	▇▆▇▇▁
min_played	20	1	37.35	21.67	17.00	42.0	58.00	99	▆▅▇▅▁
min_played	21	1	37.58	30.73	17.00	40.0	55.00	454	▇▁▁▁▁
min_played	22	1	33.10	31.44	10.00	30.0	56.00	586	▇▁▁▁▁
min_played	23	1	32.05	23.06	12.00	30.0	54.00	174	▇▆▁▁▁

spotify_full_hour %>% 
  filter(min_played < 150) %>% 
  ggplot(aes(min_played, as_factor(ts_hour))) +
  #ggridges::geom_density_ridges() 
  geom_boxplot()

spotify_full_hour %>% 
  mutate(hour_played = min_played/60) %>% 
  ggplot(aes(hour_played, as_factor(ts_hour))) +
  ggridges::geom_density_ridges()

## Picking joint bandwidth of 0.109

Backlog

Arrumar o env com as bibliotecas
Escolher uma língua (português ou inglês) e trabalhar SÓ com ela
Pensar em problemas de negócio que este tipo de dado poderia possibilitar

my spotify historical data

from 20140609 to 20230514

Data

historical data

Infos Description

Dataprep

Descriptions

ts_year

parece que tem um “pulo” do ano de 2015, mas não sei dizer se isso corresponde com a realidade. Não lembro!

conn_country

tem inconsistências: GPS não é 100% confiável

episode

claramente eu não sou uma pessoa de podcast…

reason_start

reason_end

skipped

info com problema de missing local

incognito_mode

Não sei bem o que é, mas não tem dados suficientes para que vala a pena se preocupar…

Artist vs. track vs. min_played

album_name

artist_name

track_name

Qtd de minutos tocados por artistName

vs. shuffle

vs. reason_start

vs. reason_end

Qtd de minutos tocados por trackName

Anlys

tempo por semana

tempo por hora

Backlog