My Spotify Streaming History Analysis

Introduction

In this project, I analyze my Spotify streaming history, which records my listening habits from December 13th, 2020, when I created my Spotify account, to July 25th, 2024. The data was collected via Spotify’s “Download your data” page. The dataset features various attributes such as timestamps, playtime, track, artist, and album names, and many more. By utilizing data manipulation and visualization techniques, I aim to answer several questions that have been on my mind, such as who my most-listened-to artists are and what my peak listening hours are throughout the day. All R packages I will be using are listed below.

# Set the default knitr chunk options for the whole document: echo = TRUE
# shows each chunk's source code alongside its output in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

library("dplyr")
library("ggplot2")
library("patchwork")
library("lubridate")
library("forcats")
library("shiny")

Collecting data

The dataset downloaded from Spotify’s website came as .json files. To work with these files easily in R, I first converted them to CSV using a converter website. I then imported the CSV files, combined them, and removed all the columns I won’t be working with in this project.

# Import the five streaming-history exports (already converted to CSV).
data0 <- read.csv("Streaming_History_Audio_2020-2021_0.csv")
data1 <- read.csv("Streaming_History_Audio_2021_1.csv")
data2 <- read.csv("Streaming_History_Audio_2021-2022_2.csv")
data3 <- read.csv("Streaming_History_Audio_2022-2023_3.csv")
data4 <- read.csv("Streaming_History_Audio_2023-2024_4.csv")

# Stack the yearly exports into a single table.
stream_data <- rbind(data0, data1, data2, data3, data4)

# Keep only the columns used in this analysis (with shorter names for the
# track/artist/album metadata), add playtime in hours next to the raw
# millisecond value, and drop rows that have no artist.
clean_data <- stream_data %>%
  select(
    ts,
    ms_played,
    track = master_metadata_track_name,
    artist = master_metadata_album_artist_name,
    album = master_metadata_album_album_name,
    reason_start,
    reason_end,
    shuffle,
    skipped
  ) %>%
  # 1000 ms * 60 s * 60 min = 3,600,000 ms per hour
  mutate(hrs_played = ms_played / (1000 * 60 * 60), .after = ms_played) %>%
  filter(!is.na(artist), artist != "")

Question 1: How many distinct artists have I listened to since I started using Spotify?

# Minimum total playtime, in milliseconds (300000 ms = 5 minutes), that an
# artist must exceed to be counted; artists at or below it are left out.
threshold <- 300000

# Sum the playtime per artist and count how many artists pass the threshold.
distinct_artists <- clean_data %>%
  group_by(artist) %>%
  summarize(total_ms_played = sum(ms_played)) %>%
  filter(total_ms_played > threshold) %>%
  nrow()

# paste() already separates its arguments with one space, so the pieces carry
# no padding of their own (a leading space here printed a double space).
print(paste("I have listened to a total of", distinct_artists, "distinct artists"))

## [1] "I have listened to a total of 1160 distinct artists"

Question 2: Which 10 artists have I listened to the most?

# Total listening hours per artist, sorted from most to least played and
# trimmed to the first ten rows.
top_artist <- clean_data %>%
  group_by(artist) %>%
  summarize(hrs_played = sum(hrs_played)) %>%
  arrange(desc(hrs_played)) %>%
  head(n = 10)

print(top_artist)

## # A tibble: 10 × 2
##    artist             hrs_played
##    <chr>                   <dbl>
##  1 Jack Harlow             189. 
##  2 Mac Miller              142. 
##  3 Kanye West               74.9
##  4 DaBaby                   71.7
##  5 Frank Ocean              65.2
##  6 Lil Tjay                 64.6
##  7 J. Cole                  63.2
##  8 Big Sean                 61.9
##  9 Tyler, The Creator       60.2
## 10 Berner                   56.1

# Horizontal bar chart of the top artists; reorder() sorts the bars by hours
# played and coord_flip() turns them sideways so the names stay readable.
top_artist_plot <- top_artist %>%
  ggplot(aes(x = reorder(artist, hrs_played), y = hrs_played)) +
  geom_col(fill = "deeppink3") +
  coord_flip() +
  labs(
    title = "Top artist played",
    x = "Artist name",
    y = "Hours played"
  ) +
  theme_minimal()

print(top_artist_plot)

Question 3: Which 10 albums have I listened to the most?

# Total listening hours per album, sorted from most to least played and
# trimmed to the first ten rows.
# NOTE(review): rows are grouped by album title only, so two albums sharing a
# title but released by different artists would be pooled - confirm this is
# acceptable for this dataset.
top_album <- clean_data %>%
  group_by(album) %>%
  summarize(hrs_played = sum(hrs_played)) %>%
  arrange(desc(hrs_played)) %>%
  head(n = 10)

print(top_album)

## # A tibble: 10 × 2
##    album                             hrs_played
##    <chr>                                  <dbl>
##  1 Thats What They All Say                 58.7
##  2 GO:OD AM                                45.3
##  3 My Beautiful Dark Twisted Fantasy       39.5
##  4 Detroit 2                               34.7
##  5 Swimming                                30.6
##  6 Flower Boy                              29.1
##  7 11/11                                   28.2
##  8 Confetti                                26.5
##  9 BLAME IT ON BABY                        24.6
## 10 K.I.D.S.                                23.8

# Horizontal bar chart of the top albums; reorder() sorts the bars by hours
# played and coord_flip() turns them sideways so the titles stay readable.
top_album_plot <- top_album %>%
  ggplot(aes(x = reorder(album, hrs_played), y = hrs_played)) +
  geom_col(fill = "deeppink3") +
  coord_flip() +
  labs(
    title = "Top album played",
    x = "Album name",
    y = "Hours played"
  ) +
  theme_minimal()

print(top_album_plot)

Question 4, 5 & 6: Peak listening month, weekday, and hour throughout the day

# Parse the timestamp strings into date-times and derive the calendar fields
# used by the peak-listening summaries below.
# NOTE(review): timestamps are parsed and kept in UTC, so month, weekday and
# hour are UTC-based rather than local time - confirm that is intended.
clean_data <- clean_data %>%
  mutate(
    ts = lubridate::ymd_hms(ts, tz = "UTC"),
    month = lubridate::month(ts),
    weekday = lubridate::wday(ts, label = TRUE),
    hour = lubridate::hour(ts)
  )

# Earliest and latest successfully parsed timestamps in the history. min() and
# max() are used instead of the first and last rows so the result does not
# depend on the rows being in chronological order.
first_date <- min(clean_data$ts, na.rm = TRUE)
last_date <- max(clean_data$ts, na.rm = TRUE)

# Number of whole (365-day) years spanned by the history.
num_years <- as.numeric(difftime(last_date, first_date, units = "days")) %/% 365

# Start of the window covering the most recent `num_years` full years, so that
# every calendar month falls inside the window the same number of times.
# `%m-%` rolls back to the last valid day of the month instead of returning NA
# when the shifted date does not exist (e.g. Feb 29 minus one year).
cutoff_date <- last_date %m-% years(num_years)

# Hours played per calendar month, restricted to the whole-year window that
# starts at cutoff_date, with rows lacking a month left out.
peak_month <- clean_data %>%
  filter(ts >= cutoff_date, !is.na(month)) %>%
  group_by(month) %>%
  summarize(hrs_played = sum(hrs_played)) %>%
  arrange(month)

print(peak_month)

## # A tibble: 12 × 2
##    month hrs_played
##    <dbl>      <dbl>
##  1     1      164. 
##  2     2      143. 
##  3     3      257. 
##  4     4      167. 
##  5     5       50.8
##  6     6       55.4
##  7     7       93.8
##  8     8      190. 
##  9     9      195. 
## 10    10      176. 
## 11    11      185. 
## 12    12      173.

# Three-letter English month names in calendar order ("Jan" ... "Dec"), taken
# from the base R constant rather than typed out by hand.
month_abbreviations <- month.abb

# Bar chart of hours played per calendar month.
peak_month_plot <- ggplot(peak_month, aes(x = factor(month, levels = 1:12), y = hrs_played)) +
  geom_col(fill = "cadetblue") +
  # drop = FALSE keeps all 12 factor levels on the axis even when a month has
  # no rows, so the 12 labels always line up with the 12 positions.
  scale_x_discrete(labels = month_abbreviations, drop = FALSE) +
  labs(title = "Peak months listened",
       x = "Month",
       y = "Hours played") +
  theme_minimal()

# Hours played per day of the week across the whole history, with rows lacking
# a weekday left out.
peak_weekday <- clean_data %>%
  filter(!is.na(weekday)) %>%
  group_by(weekday) %>%
  summarize(hrs_played = sum(hrs_played)) %>%
  arrange(weekday)

print(peak_weekday)

## # A tibble: 7 × 2
##   weekday hrs_played
##   <ord>        <dbl>
## 1 Sun           366.
## 2 Mon           489.
## 3 Tue           481.
## 4 Wed           542.
## 5 Thu           501.
## 6 Fri           501.
## 7 Sat           326.

# Bar chart of hours played per weekday.
peak_weekday_plot <- peak_weekday %>%
  ggplot(aes(x = weekday, y = hrs_played)) +
  geom_col(fill = "cadetblue") +
  labs(
    title = "Peak weekdays listened",
    x = "Weekday",
    y = "Hours played"
  ) +
  theme_minimal()

# Hours played per hour of the day (0-23) across the whole history, with rows
# lacking an hour left out.
peak_hour <- clean_data %>%
  filter(!is.na(hour)) %>%
  group_by(hour) %>%
  summarize(hrs_played = sum(hrs_played)) %>%
  arrange(hour)

print(peak_hour)

## # A tibble: 24 × 2
##     hour hrs_played
##    <int>      <dbl>
##  1     0      103. 
##  2     1      131. 
##  3     2      124. 
##  4     3      112. 
##  5     4      117. 
##  6     5      102. 
##  7     6       99.1
##  8     7       83.3
##  9     8       81.8
## 10     9       72.2
## # ℹ 14 more rows

# Bar chart of hours played per hour of the day, with a tick at every hour.
peak_hour_plot <- peak_hour %>%
  ggplot(aes(x = hour, y = hrs_played)) +
  geom_col(fill = "cadetblue") +
  labs(
    title = "Peak hours listened",
    x = "Hour",
    y = "Hours played"
  ) +
  scale_x_continuous(breaks = 0:23) +
  theme_minimal()

# Combine the three charts with patchwork: month and weekday side by side on
# the top row, the hourly chart stacked underneath.
peak_plots <- (peak_month_plot | peak_weekday_plot) / peak_hour_plot

print(peak_plots)