In this project, I analyze my Spotify streaming history, which records my listening habits from December 13th, 2020, when I created my Spotify account, to July 25th, 2024. The data was collected via Spotify’s “Download your data” page. The dataset includes attributes such as timestamps, play time, and track, artist, and album names, among many others. Using data manipulation and visualization techniques, I aim to answer questions that have been on my mind, such as who my most-listened-to artists are and what my peak listening hours are throughout the day. All R packages I will be using are listed below.
knitr::opts_chunk$set(echo = TRUE)
library("dplyr")
library("ggplot2")
library("patchwork")
library("lubridate")
library("forcats")
library("shiny")
Spotify provides the data as .json files. To work with them easily in R, I first converted the .json files to CSV using a converter website. I then imported the CSV files, combined them, and removed the columns I won’t be using in this project.
# Read the five streaming-history exports (already converted to CSV).
data0 <- read.csv("Streaming_History_Audio_2020-2021_0.csv")
data1 <- read.csv("Streaming_History_Audio_2021_1.csv")
data2 <- read.csv("Streaming_History_Audio_2021-2022_2.csv")
data3 <- read.csv("Streaming_History_Audio_2022-2023_3.csv")
data4 <- read.csv("Streaming_History_Audio_2023-2024_4.csv")

# Stack the exports, in file order, into a single table.
stream_data <- do.call(rbind, list(data0, data1, data2, data3, data4))

# Keep only rows that name an artist, add play time in hours, and retain
# (with shorter names) just the columns the analysis below relies on.
clean_data <- stream_data %>%
  filter(
    !is.na(master_metadata_album_artist_name),
    master_metadata_album_artist_name != ""
  ) %>%
  mutate(hrs_played = ms_played / (1000 * 60 * 60)) %>% # ms -> hours
  select(
    ts,
    ms_played,
    hrs_played,
    track = master_metadata_track_name,
    artist = master_metadata_album_artist_name,
    album = master_metadata_album_album_name,
    reason_start,
    reason_end,
    shuffle,
    skipped
  )
# Minimum total play time, in milliseconds, an artist must exceed to be
# counted (300000 ms = 5 minutes).
threshold <- 300000

# Number of artists whose summed play time is above the threshold.
distinct_artists <- clean_data %>%
  group_by(artist) %>%
  summarize(total_ms_played = sum(ms_played)) %>%
  filter(total_ms_played > threshold) %>%
  nrow()

# paste() already separates its arguments with a single space, so the pieces
# must not carry padding of their own (a leading space here would print a
# double space before "distinct").
print(
  paste("I have listened to a total of", distinct_artists, "distinct artists")
)
## [1] "I have listened to a total of 1160 distinct artists"
# Total hours played per artist, sorted from most to least, keeping the
# first ten rows.
top_artist <- clean_data %>%
  group_by(artist) %>%
  summarise(hrs_played = sum(hrs_played)) %>%
  arrange(desc(hrs_played)) %>%
  head(n = 10)
print(top_artist)
## # A tibble: 10 × 2
## artist hrs_played
## <chr> <dbl>
## 1 Jack Harlow 189.
## 2 Mac Miller 142.
## 3 Kanye West 74.9
## 4 DaBaby 71.7
## 5 Frank Ocean 65.2
## 6 Lil Tjay 64.6
## 7 J. Cole 63.2
## 8 Big Sean 61.9
## 9 Tyler, The Creator 60.2
## 10 Berner 56.1
# Horizontal bar chart of the top artists, with bars ordered by hours played.
top_artist_plot <- ggplot(top_artist) +
  geom_col(
    aes(x = reorder(artist, hrs_played), y = hrs_played),
    fill = "deeppink3"
  ) +
  coord_flip() + # artist names read more easily on the vertical axis
  ggtitle("Top artist played") +
  xlab("Artist name") +
  ylab("Hours played") +
  theme_minimal()
print(top_artist_plot)
# Total hours played per album, sorted from most to least, keeping the
# first ten rows.
top_album <- clean_data %>%
  group_by(album) %>%
  summarise(hrs_played = sum(hrs_played)) %>%
  arrange(desc(hrs_played)) %>%
  head(n = 10)
print(top_album)
## # A tibble: 10 × 2
## album hrs_played
## <chr> <dbl>
## 1 Thats What They All Say 58.7
## 2 GO:OD AM 45.3
## 3 My Beautiful Dark Twisted Fantasy 39.5
## 4 Detroit 2 34.7
## 5 Swimming 30.6
## 6 Flower Boy 29.1
## 7 11/11 28.2
## 8 Confetti 26.5
## 9 BLAME IT ON BABY 24.6
## 10 K.I.D.S. 23.8
# Horizontal bar chart of the top albums, with bars ordered by hours played.
top_album_plot <- ggplot(top_album) +
  geom_col(
    aes(x = reorder(album, hrs_played), y = hrs_played),
    fill = "deeppink3"
  ) +
  coord_flip() + # album names read more easily on the vertical axis
  ggtitle("Top album played") +
  xlab("Album name") +
  ylab("Hours played") +
  theme_minimal()
print(top_album_plot)
# Parse the raw timestamp strings as UTC date-times.
# NOTE(review): this assumes the export stores `ts` in UTC - confirm.
clean_data$ts <- ymd_hms(clean_data$ts, tz = "UTC")

# Time zone in which month / weekday / hour are read off the timestamps.
# "UTC" reproduces the existing results; set this to the listener's zone
# (e.g. "America/New_York") so that "peak hours" mean local clock hours.
local_tz <- "UTC"
local_ts <- with_tz(clean_data$ts, tzone = local_tz)
clean_data$month <- month(local_ts)
clean_data$weekday <- wday(local_ts, label = TRUE)
clean_data$hour <- hour(local_ts)

# Earliest and latest stream. min()/max() are used instead of taking the
# first/last row so the result does not depend on the row order of the
# combined files; unparseable timestamps (NA) are ignored.
first_date <- min(clean_data$ts, na.rm = TRUE)
last_date <- max(clean_data$ts, na.rm = TRUE)

# Number of whole years covered by the history.
num_years <- as.numeric(difftime(last_date, first_date, units = "days")) %/% 365

# Start of the most recent `num_years` whole years. %m-% rolls back to the
# last valid day of the month instead of returning NA when `last_date` falls
# on Feb 29 and the target year is not a leap year.
cutoff_date <- last_date %m-% years(num_years)
# Hours played per calendar month, using only streams on or after
# `cutoff_date` (a whole number of years before the last stream).
peak_month <- clean_data %>%
  filter(ts >= cutoff_date, !is.na(month)) %>%
  group_by(month) %>%
  summarise(hrs_played = sum(hrs_played)) %>%
  arrange(month)
print(peak_month)
## # A tibble: 12 × 2
## month hrs_played
## <dbl> <dbl>
## 1 1 164.
## 2 2 143.
## 3 3 257.
## 4 4 167.
## 5 5 50.8
## 6 6 55.4
## 7 7 93.8
## 8 8 190.
## 9 9 195.
## 10 10 176.
## 11 11 185.
## 12 12 173.
# Axis labels for months 1-12, in calendar order.
month_abbreviations <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)

# Bar chart of hours played per month.
# The abbreviations are attached to the factor levels themselves rather than
# passed positionally to the scale, so each bar keeps its correct label even
# if some month has no rows in `peak_month`; `drop = FALSE` keeps all twelve
# months on the axis in that case.
peak_month_plot <- ggplot(
  peak_month,
  aes(
    x = factor(month, levels = 1:12, labels = month_abbreviations),
    y = hrs_played
  )
) +
  geom_col(fill = "cadetblue") +
  scale_x_discrete(drop = FALSE) +
  labs(
    title = "Peak months listened",
    x = "Month",
    y = "Hours played"
  ) +
  theme_minimal()
# Hours played per day of the week, in weekday-factor order.
peak_weekday <- clean_data %>%
  filter(!is.na(weekday)) %>%
  group_by(weekday) %>%
  summarise(hrs_played = sum(hrs_played)) %>%
  arrange(weekday)
print(peak_weekday)
## # A tibble: 7 × 2
## weekday hrs_played
## <ord> <dbl>
## 1 Sun 366.
## 2 Mon 489.
## 3 Tue 481.
## 4 Wed 542.
## 5 Thu 501.
## 6 Fri 501.
## 7 Sat 326.
# Bar chart of hours played per day of the week.
peak_weekday_plot <- ggplot(peak_weekday, aes(weekday, hrs_played)) +
  geom_col(fill = "cadetblue") +
  ggtitle("Peak weekdays listened") +
  xlab("Weekday") +
  ylab("Hours played") +
  theme_minimal()
# Hours played per hour of the day (0-23), in ascending hour order.
peak_hour <- clean_data %>%
  filter(!is.na(hour)) %>%
  group_by(hour) %>%
  summarise(hrs_played = sum(hrs_played)) %>%
  arrange(hour)
print(peak_hour)
## # A tibble: 24 × 2
## hour hrs_played
## <int> <dbl>
## 1 0 103.
## 2 1 131.
## 3 2 124.
## 4 3 112.
## 5 4 117.
## 6 5 102.
## 7 6 99.1
## 8 7 83.3
## 9 8 81.8
## 10 9 72.2
## # ℹ 14 more rows
# Bar chart of hours played per hour of the day, with a tick at every hour.
peak_hour_plot <- ggplot(peak_hour, aes(hour, hrs_played)) +
  geom_col(fill = "cadetblue") +
  scale_x_continuous(breaks = 0:23) +
  ggtitle("Peak hours listened") +
  xlab("Hour") +
  ylab("Hours played") +
  theme_minimal()
# Combine the three charts with patchwork: the month and weekday plots share
# the first row, and the hourly plot is stacked beneath them.
peak_plots <- (peak_month_plot + peak_weekday_plot) / peak_hour_plot
print(peak_plots)