This is an analysis of my entire Spotify streaming history (2015 - 2023).
I downloaded the full streaming history here and used the Spotify Web API to extract further information.

Loading Packages

library(jsonlite)
library(dplyr)
library(skimr)
library(httr)
library(purrr)
library(ggplot2)
library(keyring)
library(lubridate)
library(stringr)
library(patchwork)


Loading Data

json_files <- c("Spotify1.json", "Spotify2.json", "Spotify3.json", 
                "Spotify4.json", "Spotify5.json", "Spotify6.json")

# Read each export file and row-bind them into a single dataframe
load_df <- purrr::map_dfr(json_files, ~ fromJSON(.x))
skim(load_df)
Data summary
Name load_df
Number of rows 84141
Number of columns 21
_______________________
Column type frequency:
character 15
logical 4
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ts 0 1.00 20 20 0 78787 0
username 0 1.00 10 10 0 1 0
platform 0 1.00 3 71 0 179 0
conn_country 0 1.00 2 2 0 3 0
ip_addr_decrypted 6902 0.92 11 15 0 2406 0
user_agent_decrypted 8434 0.90 7 182 0 24 0
master_metadata_track_name 11010 0.87 1 114 0 4483 0
master_metadata_album_artist_name 11010 0.87 1 51 0 1519 0
master_metadata_album_album_name 11010 0.87 1 209 0 2773 0
spotify_track_uri 11010 0.87 36 36 0 5529 0
episode_name 73256 0.13 3 146 0 3082 0
episode_show_name 73256 0.13 4 61 0 142 0
spotify_episode_uri 73256 0.13 38 38 0 3087 0
reason_start 0 1.00 0 10 17 11 0
reason_end 6902 0.92 0 28 16 12 0

Variable type: logical

skim_variable n_missing complete_rate mean count
shuffle 0 1.00 0.63 TRU: 52630, FAL: 31511
skipped 66612 0.21 0.45 FAL: 9676, TRU: 7853
offline 0 1.00 0.02 FAL: 82672, TRU: 1469
incognito_mode 0 1.00 0.00 FAL: 84040, TRU: 101

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ms_played 0 1 3.012527e+05 7.812884e+05 0 9914 1.594530e+05 2.043560e+05 1.681762e+07 ▇▁▁▁▁
offline_timestamp 0 1 9.846933e+11 7.780133e+11 0 1667166294 1.547336e+12 1.620877e+12 1.665669e+12 ▅▁▁▁▇


Cleaning Data

Songs played have a value for master_metadata_track_name and no value for episode_name, and the opposite is true for podcasts played. I’ll use this to separate music and podcasts into separate dataframes. I’ll also filter out songs and podcasts that had 0 ms played.

music_df <- load_df %>%
  filter(is.na(episode_name)) %>%
  filter(!is.na(master_metadata_track_name)) %>%
  filter(ms_played != 0) %>%
  select(ts, ms_played, master_metadata_track_name, master_metadata_album_album_name, 
         master_metadata_album_artist_name, spotify_track_uri) %>%
  # Strip the URI prefix so only the bare track ID remains
  mutate(id = stringr::str_replace(spotify_track_uri, "spotify:track:", "")) %>%
  select(-spotify_track_uri) %>%
  rename(track_name = master_metadata_track_name,
         album_name = master_metadata_album_album_name,
         artist = master_metadata_album_artist_name)

podcast_df <- load_df %>%
  filter(!is.na(episode_name)) %>%
  select(ts, platform, ms_played, episode_name, episode_show_name) %>%
  rename(show_name = episode_show_name)

print(paste("First day using Spotify:", min(as.Date(load_df$ts))))
## [1] "First day using Spotify: 2015-04-29"


Hours Listened by Year

music_hours <- music_df %>%
  mutate(year = lubridate::year(ts)) %>% 
  group_by(year) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60))

podcast_hours <- podcast_df %>%
  mutate(year = lubridate::year(ts)) %>% 
  group_by(year) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60))

total_hours <- bind_rows(
  mutate(music_hours, category = "Music"),
  mutate(podcast_hours, category = "Podcast"))

ggplot(total_hours, aes(x = factor(year), y = total_hours_played, 
                        fill = factor(category, levels = c("Podcast", "Music")))) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = c("#1DB954", "black")) +
  labs(title = "Total Hours Played on Spotify per Year",
       y = "Total Hours Played",
       fill = "Category") +
  scale_x_discrete(labels = unique(total_hours$year)) +
  theme_minimal() +
  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank())

We can see my listening drastically increased in 2021. This was the first full year I worked an office job, where I listened to music for much of the day. I also started using Spotify as my main podcast app in 2021 - you can see I briefly tried it in 2018 too!
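
As a quick sanity check on that, the yearly podcast totals computed above can be printed directly (a minimal sketch reusing podcast_hours; output omitted):

# Yearly podcast hours - these should be near zero outside the 2018 trial and 2021 onwards
podcast_hours %>%
  mutate(total_hours_played = round(total_hours_played, 1)) %>%
  arrange(year)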

Let’s look at how much of my overall listening was in 2021-2023.

total_hours %>%
  mutate(year_group = case_when(
    between(year, 2015, 2020) ~ "2015-2020",
    between(year, 2021, 2023) ~ "2021-2023",
    TRUE ~ "Other")) %>%
  group_by(year_group) %>%
  summarise(total_hours = round(sum(total_hours_played))) %>%
  mutate(percentage = round(total_hours / sum(total_hours) * 100, 1)) %>%
  knitr::kable(caption = "Listening Hours - Overall")
Listening Hours - Overall
year_group total_hours percentage
2015-2020 1404 19.9
2021-2023 5634 80.1
total_hours %>%
  filter(category == "Music") %>%
  mutate(year_group = case_when(
    between(year, 2015, 2020) ~ "2015-2020",
    between(year, 2021, 2023) ~ "2021-2023",
    TRUE ~ "Other")) %>%
  group_by(year_group) %>%
  summarise(total_hours = round(sum(total_hours_played))) %>%
  mutate(percentage = round(total_hours / sum(total_hours) * 100, 1)) %>%
  knitr::kable(caption = "Listening Hours - Music")
Listening Hours - Music
year_group total_hours percentage
2015-2020 929 39
2021-2023 1455 61
total_hours %>%
  filter(category == "Podcast") %>%
  mutate(year_group = case_when(
    between(year, 2015, 2020) ~ "2015-2020",
    between(year, 2021, 2023) ~ "2021-2023",
    TRUE ~ "Other")) %>%
  group_by(year_group) %>%
  summarise(total_hours = round(sum(total_hours_played))) %>%
  mutate(percentage = round(total_hours / sum(total_hours) * 100, 1)) %>%
  knitr::kable(caption = "Listening Hours - Podcasts")
Listening Hours - Podcasts
year_group total_hours percentage
2015-2020 475 10.2
2021-2023 4179 89.8

More than 80% of my listening hours were in 2021 - 2023. Almost 90% of podcast hours and 61% of music hours were in the same time period.


Days Listened

Here I’m comparing the number of days I listened to any content (music or podcast) on Spotify vs days I didn’t use it at all. The music_df and podcast_df dataframes only have days with listening in them, so I got the max and min dates from music_df and created a single-column dataframe with all dates in between.

# Minutes of music played per day
music_day <- music_df %>%
  mutate(day = lubridate::date(ts)) %>% 
  group_by(day) %>%
  summarise(total_mins_played = round(sum(ms_played, na.rm = TRUE) / (1000 * 60))) 

# Get all dates 
all_dates <- seq(min(music_day$day), max(music_day$day), by = "days")
all_dates_df <- data.frame(day = all_dates)

# Join with all dates dataframe to get '0' for days where nothing was recorded
music_day <- full_join(all_dates_df, music_day, by = "day")

# Replace NAs with 0
music_day <- music_day %>%
  mutate(total_mins_played = ifelse(is.na(total_mins_played), 0, total_mins_played))

# All the same for podcasts
podcast_day <- podcast_df %>%
  mutate(day = lubridate::date(ts)) %>%
  group_by(day) %>%
  summarise(total_mins_played = round(sum(ms_played, na.rm = TRUE) / (1000 * 60)))

podcast_day <- full_join(all_dates_df, podcast_day, by="day")

podcast_day <- podcast_day %>%
  mutate(total_mins_played = ifelse(is.na(total_mins_played), 0, total_mins_played))

combined_day <- bind_rows(
  mutate(music_day, listening_type = "Music"),
  mutate(podcast_day, listening_type = "Podcasts"))

p1 <- combined_day %>%
  group_by(listening_type, 
           has_listening = ifelse(total_mins_played > 0, "Some Listening", "No Listening")) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = has_listening, 
             y = count, 
             fill = has_listening)) +
  scale_fill_manual(values = c("Black", "#1DB954")) + 
  geom_bar(stat = "identity") +
  facet_wrap(~listening_type) +
  labs(title = "Days Listening - All Time",
       y = "Days") +
  theme_minimal() +
  theme(legend.position = "None", 
        axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1))

In the graph of total hours listened per year I found that there was much more listening from 2021 onwards. I filtered the days listened data for the last three years to see if this looked different to my overall listening.

# Filter for last 3 years
p2 <- combined_day %>%
  filter(lubridate::year(day) %in% c(2021, 2022, 2023)) %>%
  group_by(listening_type, 
           has_listening = ifelse(total_mins_played > 0, "Some Listening", "No Listening")) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = has_listening, 
             y = count, 
             fill = has_listening)) +
  scale_fill_manual(values = c("Black", "#1DB954")) + 
  geom_bar(stat = "identity") +
  facet_wrap(~listening_type) +
  labs(title = "Days Listening - 2021 to 2023",
       y = "Days") +
  theme_minimal() +
  theme(legend.position = "None", 
        axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1))

p1|p2

There are far fewer days with no listening in 2021 - 2023, which makes sense given the increase in listening over this time period. This is especially true for podcasts, which again makes sense as the vast majority of my all-time podcast listening was from 2021 onwards.
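
To put a rough number on that, the share of days with any listening can be compared before and after the start of 2021 (a quick sketch reusing combined_day; output omitted):

# Percentage of days with any listening, split at the start of 2021
combined_day %>%
  mutate(period = ifelse(lubridate::year(day) >= 2021, "2021-2023", "2015-2020")) %>%
  group_by(listening_type, period) %>%
  summarise(pct_days_listening = round(mean(total_mins_played > 0) * 100, 1),
            .groups = "drop")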


Top Artists

# Top artists - all time
music_df %>%
  group_by(artist) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  ungroup() %>%
  top_n(10, wt = total_hours_played) %>%
  arrange(desc(total_hours_played)) %>%
  ggplot(aes(x = reorder(artist, -total_hours_played), 
             y = total_hours_played)) +
  geom_bar(stat = "identity", 
           fill = "#1DB954", 
           color = "black") +
  labs(title = "All Time Artist Hours Played on Spotify",
       y = "Total Hours Played") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        axis.title.x = element_blank())

I also tried filtering my top artists for 2021 onwards and for pre-2021. I would expect my overall top artists to most closely resemble those for 2021 - 2023, as this time period accounts for over 60% of my music hours.

# Top Artists - 2021 to 2023
music_df %>%
  mutate(ts = ymd_hms(ts)) %>%
  filter(year(ts) %in% c(2021, 2022, 2023)) %>%
  group_by(artist) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  ungroup() %>%
  top_n(10, wt = total_hours_played) %>%
  arrange(desc(total_hours_played)) %>%
  ggplot(aes(x = reorder(artist, -total_hours_played), 
             y = total_hours_played)) +
  geom_bar(stat = "identity", 
           fill = "#1DB954", 
           color = "black") +
  labs(title = "Artist Hours Played on Spotify - 2021 to 2023",
       y = "Total Hours Played") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        axis.title.x = element_blank())

# Top Artists - pre-2021
music_df %>%
  mutate(ts = ymd_hms(ts)) %>%
  filter(!year(ts) %in% c(2021, 2022, 2023)) %>%
  group_by(artist) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  ungroup() %>%
  top_n(10, wt = total_hours_played) %>%
  arrange(desc(total_hours_played)) %>%
  ggplot(aes(x = reorder(artist, -total_hours_played), 
             y = total_hours_played)) +
  geom_bar(stat = "identity", 
           fill = "#1DB954", 
           color = "black") +
  labs(title = "Artist Hours Played on Spotify - 2015-2020",
       y = "Total Hours Played") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        axis.title.x = element_blank())

My top artists pre-2021 are significantly different from my overall and 2021-onward top artists. Most notably, my top 2 artists overall, Qveen Herby and Taylor Swift, sit at positions 8 and 10 respectively in the pre-2021 chart.
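
As a rough check on those positions, the pre-2021 ranks of both artists can be pulled out directly (a minimal sketch, assuming the artist names match the strings below exactly; output omitted):

# Pre-2021 hours and rank for my two overall top artists
music_df %>%
  mutate(ts = ymd_hms(ts)) %>%
  filter(year(ts) < 2021) %>%
  group_by(artist) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  mutate(rank = row_number(desc(total_hours_played))) %>%
  filter(artist %in% c("Qveen Herby", "Taylor Swift"))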

Rank of Top 5 Artists Over Time

Here I tracked the rank over time of my top 5 all-time artists.

# Get top 5 artists of all time
top_5_artists <- music_df %>%
  group_by(artist) %>%
  summarize(total_hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  arrange(desc(total_hours_played)) %>%
  top_n(5, wt = total_hours_played)

# Yearly hours and within-year rank for every artist, keeping only the top 5 overall
ranking <- music_df %>%
  mutate(year = lubridate::year(ts)) %>%
  group_by(year, artist) %>%
  summarise(hours_played = sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60)) %>%
  mutate(rank = row_number(desc(hours_played))) %>%
  ungroup() %>%
  filter(artist %in% top_5_artists$artist)

ggplot(ranking, aes(x = year, 
                    y = rank, 
                    color = artist)) +
  geom_point() +
  geom_line() +
  labs(title = "Top 5 Artists Over Time",
       y = "Rank") +
  theme_minimal() +
  scale_y_reverse(breaks = seq(0, max(ranking$rank), by = 10)) +
  scale_x_continuous(breaks = unique(ranking$year)) +
  theme(axis.title.x = element_blank(),
        panel.grid.minor = element_blank())

We can see that Taylor Swift has a huge range of rankings over time!

ranking %>%
  filter(artist == "Taylor Swift") %>%
  summarise(artist = first(artist),
            Lowest_Ranking = max(rank),
            Highest_Ranking = min(rank))

To make the plots easier to interpret, I’ll separate Taylor Swift from the other 4 artists.

ranking_ts <- ranking %>%
  filter(artist == "Taylor Swift")

ggplot(ranking_ts, aes(x = year, 
                       y = rank, 
                       color = artist)) +
  geom_line() + 
  geom_point() +
  labs(title = "Taylor Swift Rank Over Time",
       y = "Rank") +
  theme_minimal() +
  scale_y_reverse(breaks = seq(0, max(ranking$rank), by = 10)) +
  scale_x_continuous(breaks = unique(ranking$year)) +
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        panel.grid.minor = element_blank())

ranking_filtered <- ranking %>%
  filter(artist != "Taylor Swift")

ggplot(ranking_filtered, aes(x = year, 
                             y = rank,
                             color = artist)) +
  geom_line() + 
  geom_point() +
  labs(title = "Top Artists Rank Over Time - Excluding Taylor Swift",
       y = "Rank") +
  theme_minimal() +
  scale_y_reverse(breaks = seq(0, max(ranking$rank), by = 10)) +
  scale_x_continuous(breaks = unique(ranking$year)) +
  theme(axis.title.x = element_blank(),
        panel.grid.minor = element_blank())

The lowest ranking any of the other four top artists reached was Lily Allen at 30th most-listened in 2021.
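
A quick way to confirm this is to pull the worst (largest) yearly rank reached by each of the remaining four artists (a minimal sketch reusing ranking_filtered; output omitted):

# Worst yearly rank reached by each of the other four top artists
ranking_filtered %>%
  group_by(artist) %>%
  slice_max(rank, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(artist, year, rank) %>%
  arrange(desc(rank))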


Top 50 Tracks

Here I’m using the Spotify Web API to get Spotify’s popularity rating and explicit-content status for each of my top 50 songs.

# Top 50 tracks
top_songs <- music_df %>%
  group_by(track_name, id, artist) %>%
  summarize(total_plays = n()) %>%
  ungroup() %>%
  top_n(50, wt = total_plays) %>%
  arrange(desc(total_plays)) 

# Get track IDs
track_ids <- top_songs$id

# Function to get Spotify access token
get_spotify_token <- function(client_id, client_secret) {
  url <- "https://accounts.spotify.com/api/token"
  
  body <- list(
    grant_type = "client_credentials",
    client_id = client_id,
    client_secret = client_secret)
  
  response <- POST(
    url,
    body = body,
    encode = "form",
    add_headers("Content-Type" = "application/x-www-form-urlencoded"))
  
  # Check for a successful response 
  if (status_code(response) == 200) {
    # Extract and return the access token
    content <- fromJSON(rawToChar(response$content))
    if ("access_token" %in% names(content)) {
      return(content$access_token)
    } else {
      stop("Access token not found in the response.")
    }
  } else {
    # Handle the error
    stop(paste("Error:", status_code(response)))
  }
}

# Spotify API credentials
client_id <- keyring::key_get("spotify_api", "client_id")
client_secret <- keyring::key_get("spotify_api", "client_secret")

# Get access token
access_token <- get_spotify_token(client_id, client_secret)

# Spotify API endpoint for tracks
endpoint <- "https://api.spotify.com/v1/tracks"

# Construct the URL (the /v1/tracks endpoint accepts up to 50 IDs per request)
url <- paste0(endpoint, "?ids=", paste(track_ids, collapse = ","))

# Make the GET request
response <- httr::GET(url, httr::add_headers(Authorization = paste("Bearer", access_token)))

# Check for a successful response (status code 200)
if (httr::status_code(response) == 200) {
  # Extract the parsed track information
  track_info <- httr::content(response, "parsed")
} else {
  stop(paste("Error:", httr::status_code(response)))
}

top_50_tracks <- track_info %>%
  purrr::pluck("tracks") %>%
  purrr::map_df(~data.frame(
    id = as.character(purrr::pluck(.x, "id")),
    popularity = as.numeric(purrr::pluck(.x, "popularity")),
    explicit = as.logical(purrr::pluck(.x, "explicit"))))

top_50_tracks_join <- top_50_tracks %>% 
  right_join(top_songs, by = "id")

ggplot(top_50_tracks_join, aes(x = popularity)) +
  geom_histogram(binwidth = 10, 
                 fill = "#1DB954", 
                 color = "black") +
  labs(title = "Popularity of Top 50 Tracks")

explicit_counts <- top_50_tracks %>%
  count(explicit) %>%
  mutate(percentage = n / sum(n) * 100)

ggplot(explicit_counts, aes(x = "", y = n, fill = factor(explicit))) +
  geom_bar(stat = "identity", 
           width = 1) +
  geom_text(aes(label = ifelse(explicit, paste0("Explicit\n", round(percentage, 1), "%"), 
                              paste0("Not Explicit\n", round(percentage, 1), "%"))),
            position = position_stack(vjust = 0.5), color = "white") +
  coord_polar("y") +
  labs(title = "Proportion of Explicit Songs",
       y = "") +
  scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "#1DB954"), 
                    labels = c("FALSE" = "Not Explicit", "TRUE" = "Explicit")) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        panel.grid = element_blank(),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5))


Top Podcasts

top_podcasts <- podcast_df %>%
  group_by(show_name) %>%
  summarise(total_hours_played = round(sum(ms_played, na.rm = TRUE) / (1000 * 60 * 60))) %>%
  top_n(5, wt = total_hours_played) %>%
  arrange(desc(total_hours_played)) %>%
  mutate(show_name = str_replace(show_name, "Sawbones: A Marital Tour of Misguided Medicine", "Sawbones")) %>%
  mutate(total_hours_played = as.numeric(total_hours_played),
         show_name = factor(show_name, levels = show_name[order(-total_hours_played)]))

ggplot(top_podcasts, aes(x = show_name, y = total_hours_played)) +
  geom_col(fill = "#1DB954", color = "black") +
  labs(title = "Total Hours Played by Podcast",
       y = "Total Hours Played") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        axis.title.x = element_blank())

top_podcasts
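
To see how far ahead the top show is, the ratio between it and the runner-up can be computed from the table above (a quick sketch; output omitted):

# Hours of the top show relative to the second-placed show
top_podcasts %>%
  arrange(desc(total_hours_played)) %>%
  summarise(top_vs_second = first(total_hours_played) / nth(total_hours_played, 2))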

I spent more than three times as much time listening to Let’s Go To Court! as to my second most-listened podcast. Next, I’m going to see which episodes of Let’s Go To Court! I re-listened to the most. I filtered for records where I listened for at least 15 minutes (900,000 ms) and kept only one record per episode per day, as pausing and then restarting the same episode later on the same day would otherwise be counted twice.

# Most replayed episodes
lgtc <- podcast_df %>%
  filter(show_name == "Let's Go To Court!") %>%
  filter(ms_played >= 900000) %>%
  mutate(day = as.Date(ts)) %>%
  select(episode_name, day) %>%
  distinct() %>%
  group_by(episode_name) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(5, wt = count)

ggplot(lgtc, aes(x=episode_name, y=count)) +
  geom_col(fill = "#1DB954", color = "black") +
  labs(title = "Most Replayed Episodes of Let's Go To Court!",
       y = "Times Played") +
  theme(axis.title.y = element_blank()) +
  coord_flip()