#remotes::install_github("johncassil/stringr.plus")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Primeiro você precisa solicitar as suas informações neste site download-your-data , para
mais informações sobre as bases disponibilizadas e as respectivas
informações, understanding-my-data.
E como não são dadas muitas descrições, achei este link que me ajudou a
entender melhor as variáveis e seus respectivos níveis (particularmente
o reason_start e reason_end: hack_dados_spotifhack_dados_spotify .
Fiz o download dos dados, salvei em uma pasta, e aí :
#list all JSON files from directory
files_names <- list.files("my_spotify_historical_data/", pattern = "*.json", full.names = T)
#tentei usar o pacote `fs`, não conhecia, algo não deu certo, e segui sem ele:
#files_names <- fs::dir_ls(path = "my_spotify_historical_data/", glob = "*.json")
#read them all
data_list <- files_names %>% map(~jsonlite::fromJSON(.))
#concatenate by row
spotify_full_raw <- data_list %>% bind_rows()
#pareceu uma alternativa interessante, mas fica para a Nath do futuro fuçar
# tibble(file_names = files_names) %>%
# mutate(data = map(files_names, read_csv)) %>%
# unnest()
spotify_full_raw %>% glimpse()
## Rows: 177,255
## Columns: 21
## $ ts <chr> "2014-06-09T03:49:48Z", "2014-06-09T…
## $ username <chr> "12143382095", "12143382095", "12143…
## $ platform <chr> "iOS 7.1.1 (iPad2,5)", "iOS 7.1.1 (i…
## $ ms_played <int> 211086, 194079, 231546, 201506, 1885…
## $ conn_country <chr> "BR", "BR", "BR", "BR", "BR", "BR", …
## $ ip_addr_decrypted <chr> "177.148.215.115", "177.148.215.115"…
## $ user_agent_decrypted <chr> "unknown", "unknown", "unknown", "un…
## $ master_metadata_track_name <chr> "Sober", "Money On My Mind", "Sweet …
## $ master_metadata_album_artist_name <chr> "Elli Ingram", "Sam Smith", "The Tem…
## $ master_metadata_album_album_name <chr> "Sober EP", "Money On My Mind", "Con…
## $ spotify_track_uri <chr> "spotify:track:4ymQoZNLQpis51EmcgAoN…
## $ episode_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ spotify_episode_uri <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start <chr> "", "trackdone", "trackdone", "track…
## $ reason_end <chr> "trackdone", "trackdone", "trackdone…
## $ shuffle <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ skipped <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ offline_timestamp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ incognito_mode <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
spotify_full <- spotify_full_raw %>% #glimpse()
select(-username, -platform, -ip_addr_decrypted,
-user_agent_decrypted, -spotify_track_uri, -spotify_episode_uri) %>%
mutate(ts = lubridate::as_datetime(ts, format="%Y-%m-%dT%H:%M:%SZ")) %>%
mutate(ts_date = lubridate::date(ts)) %>%
mutate(ts_year = lubridate::year(ts)) %>%
mutate(min_played = round(ms_played*1.7*10^(-5))) %>%
relocate(min_played, .after = ms_played) %>%
rename(track_name = master_metadata_track_name) %>%
rename(artist_name = master_metadata_album_artist_name) %>%
rename(album_name = master_metadata_album_album_name) %>%
tibble() %>%
glimpse()
## Rows: 177,255
## Columns: 18
## $ ts <dttm> 2014-06-09 03:49:48, 2014-06-09 03:53:02, 2014-06-0…
## $ ms_played <int> 211086, 194079, 231546, 201506, 188586, 257386, 2074…
## $ min_played <dbl> 4, 3, 4, 3, 3, 4, 4, 4, 3, 3, 4, 4, 4, 0, 1, 1, 0, 0…
## $ conn_country <chr> "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR"…
## $ track_name <chr> "Sober", "Money On My Mind", "Sweet Disposition", "H…
## $ artist_name <chr> "Elli Ingram", "Sam Smith", "The Temper Trap", "Hot …
## $ album_name <chr> "Sober EP", "Money On My Mind", "Conditions", "Whate…
## $ episode_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ episode_show_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ reason_start <chr> "", "trackdone", "trackdone", "trackdone", "trackdon…
## $ reason_end <chr> "trackdone", "trackdone", "trackdone", "trackdone", …
## $ shuffle <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
## $ skipped <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ offline_timestamp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ incognito_mode <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ ts_date <date> 2014-06-09, 2014-06-09, 2014-06-09, 2014-06-09, 201…
## $ ts_year <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…
spotify_full %>%janitor::tabyl(ts_year)
## ts_year n percent
## 2014 210 0.001184734
## 2016 16799 0.094773067
## 2017 39737 0.224179854
## 2018 47648 0.268810471
## 2019 32678 0.184355871
## 2020 15130 0.085357254
## 2021 9767 0.055101408
## 2022 9921 0.055970212
## 2023 5365 0.030267129
spotify_full %>%
filter(between(ts_year, 2014, 2016)) %>%
janitor::tabyl(ts_date) %>%
head(n=15)
## ts_date n percent
## 2014-06-09 28 0.0016461873
## 2014-06-14 27 0.0015873949
## 2014-06-15 14 0.0008230937
## 2014-06-16 11 0.0006467164
## 2014-06-19 15 0.0008818861
## 2014-06-20 27 0.0015873949
## 2014-06-22 14 0.0008230937
## 2014-06-23 5 0.0002939620
## 2014-07-02 9 0.0005291316
## 2014-07-07 10 0.0005879240
## 2014-07-08 4 0.0002351696
## 2014-08-09 8 0.0004703392
## 2014-09-12 3 0.0001763772
## 2014-10-25 35 0.0020577341
## 2016-04-19 1 0.0000587924
spotify_full %>% janitor::tabyl(conn_country) %>% arrange(-n)
## conn_country n percent
## BR 169911 0.9585681645
## PT 4475 0.0252461144
## ZZ 1557 0.0087839553
## US 553 0.0031197992
## DE 299 0.0016868354
## IE 218 0.0012298666
## AT 207 0.0011678091
## NL 35 0.0001974556
spotify_full %>% filter(conn_country == "US") %>%
janitor::tabyl(ts_year) %>% arrange(ts_year)
## ts_year n percent
## 2018 540 0.97649186
## 2019 13 0.02350814
spotify_full %>%
filter(conn_country == "ZZ") %>%
janitor::tabyl(ts_year) %>%
arrange(ts_year)
## ts_year n percent
## 2017 710 0.45600514
## 2018 717 0.46050096
## 2019 117 0.07514451
## 2020 13 0.00834939
spotify_full %>%
filter(conn_country == "IE") %>%
janitor::tabyl(ts_date) %>%
arrange(ts_date)
## ts_date n percent
## 2020-02-10 100 0.458715596
## 2020-04-05 3 0.013761468
## 2020-08-04 1 0.004587156
## 2020-08-11 3 0.013761468
## 2020-08-17 8 0.036697248
## 2022-10-19 5 0.022935780
## 2022-10-20 70 0.321100917
## 2022-10-22 28 0.128440367
spotify_full %>%
group_by(episode_show_name) %>%
count() %>%
arrange(-n)
## # A tibble: 34 × 2
## # Groups: episode_show_name [34]
## episode_show_name n
## <chr> <int>
## 1 <NA> 177065
## 2 GunCast | Criatividade e Inovação 56
## 3 Mario Sergio Cortella - No Meio do Caminho - Mario Sergio Cortella 37
## 4 Mamilos 15
## 5 Data Hackers 12
## 6 Elas Programam 6
## 7 Pizza de Dados 6
## 8 Spotify 6
## 9 Inédita Pamonha 5
## 10 AMOR E SEXO, contos eróticos narrados 4
## # ℹ 24 more rows
spotify_full %>%
mutate(episode_NA = ifelse(is.na(episode_show_name), "nao-podcast", "podcast")) %>%
#group_by(episode_NA) %>%
janitor::tabyl(episode_NA)
## episode_NA n percent
## nao-podcast 177065 0.998928098
## podcast 190 0.001071902
Considerando as descrições abaixo, o “clickrow” parece ser interessante, pois indica o interesse na faixa em específico, o “playbtn” também, mas aqui há uma mudança de lista, de estilo. Mas entendo que as duas têm uma natureza similar. O “backbtn” parece bem interessante também, pois indica o comportamento de “repeat”.
spotify_full %>%
janitor::tabyl(reason_start) %>%
janitor::adorn_pct_formatting() %>%
arrange(-n)
## reason_start n percent
## trackdone 111533 62.9%
## fwdbtn 45318 25.6%
## clickrow 8781 5.0%
## backbtn 4667 2.6%
## appload 2471 1.4%
## playbtn 2432 1.4%
## unknown 1171 0.7%
## trackerror 613 0.3%
## remote 227 0.1%
## 42 0.0%
Pelas descrições que seguem, o “trackdone” talvez seja um filtro importante para aplicar em toda a base a depender do objetivo. Já as músicas “fwdbtn” indicam uma falta de interesse direta, e o “endplay” um desinteresse indireto talvez? Na contramão, “backbtn”, assim como o `reason_start`, é sobre o comportamento de “repeat”.
spotify_full %>%
janitor::tabyl(reason_end) %>%
janitor::adorn_pct_formatting() %>%
arrange(-n)
## reason_end n percent
## trackdone 112251 63.3%
## fwdbtn 45618 25.7%
## endplay 7840 4.4%
## backbtn 4704 2.7%
## logout 3881 2.2%
## unknown 1351 0.8%
## unexpected-exit-while-paused 1181 0.7%
## unexpected-exit 159 0.1%
## trackerror 135 0.1%
## remote 108 0.1%
## 26 0.0%
## clickrow 1 0.0%
Estava super animada para avaliar essa info, mas parece que ela não foi devidamente registrada ao longo de todo o histórico :( Enviei um e-mail para o spotify para entender melhor sobre!
spotify_full %>% janitor::tabyl(skipped) %>% arrange(-n)
## skipped n percent valid_percent
## NA 168941 0.95309582 NA
## FALSE 5396 0.03044202 0.6490257
## TRUE 2918 0.01646216 0.3509743
spotify_full %>% janitor::tabyl(skipped, ts_year)
## skipped 2014 2016 2017 2018 2019 2020 2021 2022 2023
## FALSE 133 0 3 0 0 0 0 1624 3636
## TRUE 77 0 0 0 0 0 0 1112 1729
## NA 0 16799 39734 47648 32678 15130 9767 7185 0
spotify_full %>%
janitor::tabyl(reason_end, skipped) %>%
janitor::adorn_totals("col") %>%
arrange(-Total)
## reason_end FALSE TRUE NA_ Total
## trackdone 5014 0 107237 112251
## fwdbtn 0 2154 43464 45618
## endplay 0 517 7323 7840
## backbtn 0 221 4483 4704
## logout 337 0 3544 3881
## unknown 0 0 1351 1351
## unexpected-exit-while-paused 18 0 1163 1181
## unexpected-exit 14 0 145 159
## trackerror 0 0 135 135
## remote 13 0 95 108
## 0 26 0 26
## clickrow 0 0 1 1
spotify_full %>% janitor::tabyl(incognito_mode) %>% arrange(-n)
## incognito_mode n percent
## FALSE 177251 9.999774e-01
## TRUE 4 2.256636e-05
spotify_full %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
ungroup() %>%
slice_max(min_played, n = 100) %>%
DT::datatable()
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
spotify_full %>%
group_by(album_name, artist_name) %>%
count() %>%
arrange(-n) %>%
ungroup() %>%
slice_head(n = 100) %>%
DT::datatable()
spotify_full %>%
group_by(artist_name) %>%
count() %>%
arrange(-n)
## # A tibble: 6,326 × 2
## # Groups: artist_name [6,326]
## artist_name n
## <chr> <int>
## 1 P!nk 3265
## 2 James Morrison 2765
## 3 Queen 2617
## 4 Marília Mendonça 2551
## 5 John Mayer 1678
## 6 Cássia Eller 1586
## 7 Maroon 5 1500
## 8 Boyce Avenue 1491
## 9 TIAGO IORC 1377
## 10 Marisa Monte 1285
## # ℹ 6,316 more rows
spotify_full %>%
group_by(artist_name) %>%
count() %>%
ungroup() %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 6326 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| artist_name | 1 | 1 | 2 | 91 | 0 | 6325 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| n | 0 | 1 | 28.02 | 122.42 | 1 | 1 | 3 | 10 | 3265 | ▇▁▁▁▁ |
spotify_full %>%
group_by(track_name, artist_name) %>%
count() %>%
arrange(-n)
## # A tibble: 22,268 × 3
## # Groups: track_name, artist_name [22,268]
## track_name artist_name n
## <chr> <chr> <int>
## 1 Please Don't Stop The Rain James Morrison 431
## 2 You Give Me Something - (Live fromTokyo) James Morrison 356
## 3 Drops of Jupiter (Tell Me) Train 343
## 4 Mr. Brightside The Killers 338
## 5 Viva Zimbra 319
## 6 Classic MKTO 314
## 7 Resposta Skank 310
## 8 Never Gonna Let You Down Colbie Caillat 308
## 9 Sorry Not Sorry - Acoustic Demi Lovato 307
## 10 Bubbly Colbie Caillat 305
## # ℹ 22,258 more rows
spotify_full %>%
group_by(artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## # A tibble: 6,326 × 3
## artist_name min_played p
## <chr> <dbl> <dbl>
## 1 P!nk 10808 0.0229
## 2 James Morrison 9990 0.0212
## 3 Queen 9216 0.0196
## 4 Marília Mendonça 6451 0.0137
## 5 Maroon 5 4947 0.0105
## 6 John Mayer 4913 0.0104
## 7 Cássia Eller 4342 0.00921
## 8 Boyce Avenue 4301 0.00913
## 9 Lady Gaga 3960 0.00840
## 10 U2 3906 0.00829
## # ℹ 6,316 more rows
spotify_full %>%
group_by(artist_name) %>%
summarise(min_played = sum(min_played)) %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 6326 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| artist_name | 1 | 1 | 2 | 91 | 0 | 6325 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| min_played | 0 | 1 | 74.5 | 365.82 | 0 | 3 | 6 | 24 | 10808 | ▇▁▁▁▁ |
spotify_full %>%
filter(shuffle == FALSE) %>%
group_by(artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## # A tibble: 3,658 × 3
## artist_name min_played p
## <chr> <dbl> <dbl>
## 1 James Morrison 3138 0.0210
## 2 <NA> 2163 0.0145
## 3 P!nk 2107 0.0141
## 4 John Mayer 2067 0.0139
## 5 Marília Mendonça 2022 0.0136
## 6 Cássia Eller 1777 0.0119
## 7 Boyce Avenue 1643 0.0110
## 8 Maria Gadú 1535 0.0103
## 9 Elvis Presley 1510 0.0101
## 10 Colbie Caillat 1482 0.00994
## # ℹ 3,648 more rows
spotify_full %>%
filter(reason_start %in% c("playbtn","clickrow")) %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 4,855 × 4
## # Groups: track_name [4,530]
## track_name artist_name min_played p
## <chr> <chr> <dbl> <dbl>
## 1 <NA> <NA> 1386 1
## 2 Please Don't Stop The Rain James Morr… 215 1
## 3 Girls Like You (feat. Cardi B) - Cardi B Version Maroon 5 212 1
## 4 Just A Fool Christina … 204 1
## 5 Beautiful Trauma P!nk 185 1
## 6 Sorry Not Sorry - Acoustic Demi Lovato 181 1
## 7 You Make It Real James Morr… 127 1
## 8 Shallow - Radio Edit Lady Gaga 121 1
## 9 Say It All Over Again James Morr… 118 1
## 10 Viva Zimbra 114 1
## # ℹ 4,845 more rows
spotify_full %>%
filter(reason_start %in% c("backbtn")) %>%
group_by(artist_name, track_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'artist_name'. You can override using the
## `.groups` argument.
## # A tibble: 2,083 × 4
## # Groups: artist_name [1,009]
## artist_name track_name min_played p
## <chr> <chr> <dbl> <dbl>
## 1 Zimbra Viva 192 0.980
## 2 James Morrison Please Don't Stop The Rain 153 0.384
## 3 Christina Aguilera Just A Fool 149 0.882
## 4 Luciana Mello Tchau 146 1
## 5 Lady Gaga Shallow - Radio Edit 138 0.817
## 6 P!nk Beautiful Trauma 129 0.502
## 7 Zac Efron Rewrite The Stars 129 1
## 8 Gustavo Trebien Apenas Mais Uma De Amor - The Voice Bras… 114 0.983
## 9 Redbone Come and Get Your Love - Single Version 109 1
## 10 Maroon 5 Girls Like You (feat. Cardi B) - Cardi B… 105 0.766
## # ℹ 2,073 more rows
spotify_full %>%
filter(reason_end %in% c("trackdone")) %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 16,030 × 4
## # Groups: track_name [14,296]
## track_name artist_name min_played p
## <chr> <chr> <dbl> <dbl>
## 1 Please Don't Stop The Rain James Morrison 1457 1
## 2 Viva Zimbra 1253 1
## 3 Drops of Jupiter (Tell Me) Train 1158 1
## 4 You Give Me Something - (Live fromTokyo) James Morrison 1147 1
## 5 Almost Is Never Enough Ariana Grande 1145 1
## 6 Beautiful Trauma P!nk 1081 1
## 7 Mr. Brightside The Killers 1049 1
## 8 Just A Fool Christina Aguilera 1040 1
## 9 You Make It Real James Morrison 986 1
## 10 A Million Dreams P!nk 981 0.754
## # ℹ 16,020 more rows
spotify_full %>%
filter(reason_end %in% c("fwdbtn", "endplay")) %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 13,844 × 4
## # Groups: track_name [12,570]
## track_name artist_name min_played p
## <chr> <chr> <dbl> <dbl>
## 1 <NA> <NA> 419 1
## 2 "Hey Jude - Remastered 2015" The Beatles 96 1
## 3 "Don't Go Away" Oasis 88 0.978
## 4 "Paciência" Lenine 87 1
## 5 "Sinônimos (Ao Vivo)" Zé Ramalho 87 1
## 6 "Sweet Child O' Mine" Taken By T… 87 0.547
## 7 "Can You Feel The Love Tonight/Nants' Ingonyama… Jason Deru… 85 1
## 8 "Fácil" Jota Quest 85 1
## 9 "Quando Fui Chuva - Ao Vivo" Maria Gadú 82 1
## 10 "Never Gonna Let You Down" Colbie Cai… 81 1
## # ℹ 13,834 more rows
spotify_full %>%
filter(reason_end %in% c("backbtn")) %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
mutate(p= min_played/sum(min_played))
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
## # A tibble: 2,281 × 4
## # Groups: track_name [2,186]
## track_name artist_name min_played p
## <chr> <chr> <dbl> <dbl>
## 1 Beautiful Trauma P!nk 18 1
## 2 Pétala Djavan 17 1
## 3 Please Don't Stop The Rain James Morrison 16 1
## 4 Mean P!nk 13 1
## 5 Dois Sorrisos Leoni 11 1
## 6 Dos Oruguitas Sebastian Yatra 11 1
## 7 One Vision - Remastered 2011 Queen 11 1
## 8 Preto E Branco - The Voice Brasil 2016 Dan Costa 11 1
## 9 I'm Not In Love 10cc 10 1
## 10 Rewrite The Stars Zac Efron 10 1
## # ℹ 2,271 more rows
spotify_full %>%
group_by(track_name, artist_name) %>%
summarise(min_played = sum(min_played)) %>%
arrange(-min_played) %>%
DT::datatable()
## `summarise()` has grouped output by 'track_name'. You can override using the
## `.groups` argument.
(spotify_full_week <- spotify_full %>%
mutate(ts_wday = lubridate::wday(ts,
label = TRUE,
week_start = 1)) %>%
mutate(ts_week = lubridate::week(ts)) %>%
group_by(ts_wday, ts_week) %>%
summarise(min_played = sum(min_played)) %>%
ungroup() %>%
select(-ts_week))
## `summarise()` has grouped output by 'ts_wday'. You can override using the
## `.groups` argument.
## # A tibble: 370 × 2
## ts_wday min_played
## <ord> <dbl>
## 1 seg 1634
## 2 seg 2237
## 3 seg 1399
## 4 seg 1640
## 5 seg 2112
## 6 seg 2312
## 7 seg 2005
## 8 seg 1508
## 9 seg 1346
## 10 seg 1715
## # ℹ 360 more rows
spotify_full_week %>%
group_by(ts_wday) %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 370 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | ts_wday |
Variable type: numeric
| skim_variable | ts_wday | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| min_played | seg | 0 | 1 | 1565.90 | 428.49 | 607 | 1352 | 1510.5 | 1936.75 | 2320 | ▂▂▇▃▅ |
| min_played | ter | 0 | 1 | 1658.36 | 547.00 | 184 | 1443 | 1693.0 | 1973.00 | 2719 | ▂▂▇▇▃ |
| min_played | qua | 0 | 1 | 1544.02 | 513.05 | 67 | 1299 | 1554.0 | 1769.00 | 2790 | ▁▂▇▅▁ |
| min_played | qui | 0 | 1 | 1627.79 | 522.18 | 57 | 1312 | 1584.0 | 2025.00 | 2588 | ▁▂▇▆▅ |
| min_played | sex | 0 | 1 | 1414.11 | 522.60 | 267 | 1062 | 1287.0 | 1787.00 | 2758 | ▁▇▅▃▁ |
| min_played | sáb | 0 | 1 | 572.60 | 328.21 | 79 | 352 | 527.0 | 700.00 | 1913 | ▆▇▃▁▁ |
| min_played | dom | 0 | 1 | 539.43 | 269.12 | 101 | 330 | 522.0 | 687.00 | 1322 | ▆▇▆▂▁ |
spotify_full_week %>%
ggplot(aes(min_played, ts_wday)) +
#ggridges::geom_density_ridges()
geom_boxplot()
(spotify_full_hour <- spotify_full %>%
#mutate(ts_day = lubridate::day(ts)) %>%
mutate(ts_wday = lubridate::wday(ts, label = TRUE, week_start = 1)) %>%
mutate(ts_hour = lubridate::hour(ts)) %>%
group_by(ts_year, ts_date, ts_wday, ts_hour) %>%
summarise(min_played = sum(min_played)) %>%
ungroup())
## `summarise()` has grouped output by 'ts_year', 'ts_date', 'ts_wday'. You can
## override using the `.groups` argument.
## # A tibble: 13,944 × 5
## ts_year ts_date ts_wday ts_hour min_played
## <dbl> <date> <ord> <int> <dbl>
## 1 2014 2014-06-09 seg 3 11
## 2 2014 2014-06-09 seg 4 36
## 3 2014 2014-06-09 seg 14 19
## 4 2014 2014-06-14 sáb 20 42
## 5 2014 2014-06-14 sáb 21 39
## 6 2014 2014-06-14 sáb 23 2
## 7 2014 2014-06-15 dom 0 3
## 8 2014 2014-06-15 dom 17 34
## 9 2014 2014-06-15 dom 18 3
## 10 2014 2014-06-16 seg 13 19
## # ℹ 13,934 more rows
spotify_full_hour %>%
select(-ts_date, -ts_year) %>%
group_by(ts_hour) %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 13944 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | ts_hour |
Variable type: factor
| skim_variable | ts_hour | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|---|
| ts_wday | 0 | 0 | 1 | TRUE | 7 | ter: 111, qua: 104, qui: 93, sex: 87 |
| ts_wday | 1 | 0 | 1 | TRUE | 7 | ter: 91, qua: 77, qui: 75, sex: 70 |
| ts_wday | 2 | 0 | 1 | TRUE | 7 | qua: 51, qui: 49, sáb: 48, ter: 47 |
| ts_wday | 3 | 0 | 1 | TRUE | 7 | qua: 36, sex: 32, ter: 30, sáb: 29 |
| ts_wday | 4 | 0 | 1 | TRUE | 7 | sáb: 23, qua: 22, qui: 18, sex: 17 |
| ts_wday | 5 | 0 | 1 | TRUE | 7 | qua: 16, qui: 15, sex: 15, sáb: 13 |
| ts_wday | 6 | 0 | 1 | TRUE | 6 | sex: 15, sáb: 12, qua: 9, qui: 9 |
| ts_wday | 7 | 0 | 1 | TRUE | 7 | sex: 13, sáb: 11, qua: 10, qui: 9 |
| ts_wday | 8 | 0 | 1 | TRUE | 7 | sex: 19, qui: 16, seg: 15, ter: 14 |
| ts_wday | 9 | 0 | 1 | TRUE | 7 | seg: 43, qui: 43, sex: 43, ter: 41 |
| ts_wday | 10 | 0 | 1 | TRUE | 7 | qua: 95, sex: 93, qui: 91, ter: 88 |
| ts_wday | 11 | 0 | 1 | TRUE | 7 | ter: 168, qui: 152, sex: 146, qua: 139 |
| ts_wday | 12 | 0 | 1 | TRUE | 7 | ter: 182, qui: 181, sex: 175, qua: 156 |
| ts_wday | 13 | 0 | 1 | TRUE | 7 | ter: 169, qua: 160, qui: 155, sex: 148 |
| ts_wday | 14 | 0 | 1 | TRUE | 7 | ter: 169, seg: 158, qui: 152, qua: 143 |
| ts_wday | 15 | 0 | 1 | TRUE | 7 | seg: 154, ter: 151, qui: 150, qua: 138 |
| ts_wday | 16 | 0 | 1 | TRUE | 7 | seg: 138, ter: 137, qua: 132, qui: 126 |
| ts_wday | 17 | 0 | 1 | TRUE | 7 | qui: 162, qua: 161, seg: 155, ter: 155 |
| ts_wday | 18 | 0 | 1 | TRUE | 7 | qui: 161, seg: 159, qua: 158, ter: 157 |
| ts_wday | 19 | 0 | 1 | TRUE | 7 | ter: 167, seg: 164, qui: 159, qua: 151 |
| ts_wday | 20 | 0 | 1 | TRUE | 7 | ter: 176, seg: 164, qua: 159, qui: 155 |
| ts_wday | 21 | 0 | 1 | TRUE | 7 | ter: 188, seg: 173, qui: 166, qua: 162 |
| ts_wday | 22 | 0 | 1 | TRUE | 7 | seg: 163, ter: 152, qui: 138, qua: 129 |
| ts_wday | 23 | 0 | 1 | TRUE | 7 | seg: 121, qua: 121, qui: 120, ter: 112 |
Variable type: numeric
| skim_variable | ts_hour | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| min_played | 0 | 0 | 1 | 30.52 | 21.92 | 0 | 11.00 | 28.0 | 51.00 | 125 | ▇▅▅▁▁ |
| min_played | 1 | 0 | 1 | 26.89 | 22.34 | 0 | 6.00 | 23.0 | 43.00 | 164 | ▇▅▁▁▁ |
| min_played | 2 | 0 | 1 | 25.40 | 22.36 | 0 | 5.00 | 19.0 | 45.50 | 124 | ▇▃▃▁▁ |
| min_played | 3 | 0 | 1 | 22.49 | 25.78 | 0 | 3.00 | 12.0 | 42.00 | 193 | ▇▃▁▁▁ |
| min_played | 4 | 0 | 1 | 26.46 | 23.45 | 0 | 3.00 | 19.0 | 54.75 | 66 | ▇▃▂▁▆ |
| min_played | 5 | 0 | 1 | 32.86 | 30.81 | 0 | 7.00 | 24.0 | 59.00 | 191 | ▇▅▁▁▁ |
| min_played | 6 | 0 | 1 | 35.77 | 23.08 | 0 | 14.00 | 39.0 | 60.00 | 64 | ▅▃▂▃▇ |
| min_played | 7 | 0 | 1 | 29.04 | 25.03 | 0 | 4.25 | 18.5 | 59.75 | 65 | ▇▂▂▂▆ |
| min_played | 8 | 0 | 1 | 24.43 | 21.74 | 0 | 3.00 | 19.0 | 40.00 | 65 | ▇▃▃▂▃ |
| min_played | 9 | 0 | 1 | 23.52 | 26.87 | 0 | 3.00 | 17.0 | 37.00 | 264 | ▇▁▁▁▁ |
| min_played | 10 | 0 | 1 | 27.75 | 21.07 | 0 | 7.00 | 26.0 | 47.50 | 137 | ▇▅▂▁▁ |
| min_played | 11 | 0 | 1 | 30.84 | 21.35 | 0 | 13.00 | 30.0 | 48.00 | 177 | ▇▆▁▁▁ |
| min_played | 12 | 0 | 1 | 31.99 | 20.87 | 0 | 13.00 | 32.0 | 49.00 | 189 | ▇▆▁▁▁ |
| min_played | 13 | 0 | 1 | 38.41 | 21.74 | 0 | 20.00 | 42.0 | 58.00 | 151 | ▆▇▂▁▁ |
| min_played | 14 | 0 | 1 | 38.21 | 26.47 | 0 | 17.00 | 42.0 | 58.00 | 403 | ▇▁▁▁▁ |
| min_played | 15 | 0 | 1 | 34.07 | 22.15 | 0 | 15.00 | 35.0 | 53.00 | 186 | ▇▆▁▁▁ |
| min_played | 16 | 0 | 1 | 30.98 | 22.23 | 0 | 11.00 | 29.0 | 51.00 | 152 | ▇▆▂▁▁ |
| min_played | 17 | 0 | 1 | 35.32 | 20.78 | 0 | 18.00 | 37.0 | 56.00 | 82 | ▆▅▅▇▁ |
| min_played | 18 | 0 | 1 | 39.51 | 22.42 | 0 | 21.00 | 45.0 | 59.00 | 239 | ▇▇▁▁▁ |
| min_played | 19 | 0 | 1 | 38.22 | 21.79 | 0 | 19.00 | 42.0 | 59.00 | 98 | ▇▆▇▇▁ |
| min_played | 20 | 0 | 1 | 37.35 | 21.67 | 0 | 17.00 | 42.0 | 58.00 | 99 | ▆▅▇▅▁ |
| min_played | 21 | 0 | 1 | 37.58 | 30.73 | 0 | 17.00 | 40.0 | 55.00 | 454 | ▇▁▁▁▁ |
| min_played | 22 | 0 | 1 | 33.10 | 31.44 | 0 | 10.00 | 30.0 | 56.00 | 586 | ▇▁▁▁▁ |
| min_played | 23 | 0 | 1 | 32.05 | 23.06 | 0 | 12.00 | 30.0 | 54.00 | 174 | ▇▆▁▁▁ |
spotify_full_hour %>%
filter(min_played < 150) %>%
ggplot(aes(min_played, as_factor(ts_hour))) +
#ggridges::geom_density_ridges()
geom_boxplot()
spotify_full_hour %>%
mutate(hour_played = min_played/60) %>%
ggplot(aes(hour_played, as_factor(ts_hour))) +
ggridges::geom_density_ridges()
## Picking joint bandwidth of 0.109
Arrumar o env com as bibliotecas
Escolher uma língua (português ou inglês) e trabalhar SÓ com ela
Pensar em problemas de negócio que este tipo de dado poderia possibilitar