This analysis explores clustering patterns in Spotify tracks based on their audio features. We aim to: 1. Identify natural groupings of songs based on their audio characteristics 2. Understand which features contribute most to song popularity 3. Uncover patterns that could inform music production and playlist curation
# Read the dataset
spotify_data <- read.csv("spotify_tracks.csv")
# Display basic information about the dataset
glimpse(spotify_data)## Rows: 232,725
## Columns: 18
## $ genre <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie",…
## $ artist_name <chr> "Henri Salvador", "Martin & les fées", "Joseph Willia…
## $ track_name <chr> "C'est beau de faire un Show", "Perdu d'avance (par G…
## $ track_id <chr> "0BRjO6ga9RKCKjfDqeFgWV", "0BjC1NfoEOOusryehmNudP", "…
## $ popularity <int> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, …
## $ acousticness <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,…
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41…
## $ duration_ms <int> 99373, 137373, 170267, 152427, 82625, 160627, 212293,…
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270…
## $ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.23e-01, 0.0…
## $ key <chr> "C#", "F#", "C", "C#", "F", "C#", "C#", "F#", "C", "G…
## $ liveness <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105…
## $ loudness <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -…
## $ mode <chr> "Major", "Minor", "Minor", "Major", "Major", "Major",…
## $ speechiness <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953…
## $ tempo <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8…
## $ time_signature <chr> "4/4", "4/4", "5/4", "4/4", "4/4", "4/4", "4/4", "4/4…
## $ valence <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533…
Our dataset contains 232725 tracks with 18 features. Let’s examine the summary statistics:
# Summary of numerical features
summary(spotify_data %>% select(popularity, danceability, energy, loudness, speechiness,
acousticness, instrumentalness, liveness, valence, tempo))## popularity danceability energy loudness
## Min. : 0.00 Min. :0.0569 Min. :2.03e-05 Min. :-52.457
## 1st Qu.: 29.00 1st Qu.:0.4350 1st Qu.:3.85e-01 1st Qu.:-11.771
## Median : 43.00 Median :0.5710 Median :6.05e-01 Median : -7.762
## Mean : 41.13 Mean :0.5544 Mean :5.71e-01 Mean : -9.570
## 3rd Qu.: 55.00 3rd Qu.:0.6920 3rd Qu.:7.87e-01 3rd Qu.: -5.501
## Max. :100.00 Max. :0.9890 Max. :9.99e-01 Max. : 3.744
## speechiness acousticness instrumentalness liveness
## Min. :0.0222 Min. :0.0000 Min. :0.0000000 Min. :0.00967
## 1st Qu.:0.0367 1st Qu.:0.0376 1st Qu.:0.0000000 1st Qu.:0.09740
## Median :0.0501 Median :0.2320 Median :0.0000443 Median :0.12800
## Mean :0.1208 Mean :0.3686 Mean :0.1483012 Mean :0.21501
## 3rd Qu.:0.1050 3rd Qu.:0.7220 3rd Qu.:0.0358000 3rd Qu.:0.26400
## Max. :0.9670 Max. :0.9960 Max. :0.9990000 Max. :1.00000
## valence tempo
## Min. :0.0000 Min. : 30.38
## 1st Qu.:0.2370 1st Qu.: 92.96
## Median :0.4440 Median :115.78
## Mean :0.4549 Mean :117.67
## 3rd Qu.:0.6600 3rd Qu.:139.05
## Max. :1.0000 Max. :242.90
# Select relevant audio features for clustering
audio_features <- spotify_data %>%
select(danceability, energy, key, loudness, mode,
speechiness, acousticness, instrumentalness,
liveness, valence, tempo, duration_ms)
# Check for missing values
missing_values <- colSums(is.na(audio_features))
print("Missing values per feature:")## [1] "Missing values per feature:"
## danceability energy key loudness
## 0 0 0 0
## mode speechiness acousticness instrumentalness
## 0 0 0 0
## liveness valence tempo duration_ms
## 0 0 0 0
# Handle missing values if any
audio_features <- audio_features %>%
mutate_all(~ifelse(is.na(.), mean(., na.rm = TRUE), .))
# Standardize duration_ms (convert to minutes)
audio_features$duration_ms <- audio_features$duration_ms / 60000
colnames(audio_features)[colnames(audio_features) == "duration_ms"] <- "duration_min"# Select only numeric features for correlation matrix
numeric_features <- audio_features %>%
select_if(is.numeric)
# Create correlation matrix
cor_matrix <- cor(numeric_features)
# Visualize correlations
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45,
title = "Correlation Matrix of Numeric Audio Features")Key correlation findings: 1. Energy and loudness show strong positive correlation (r = 0.82) 2. Energy and acousticness have strong negative correlation (r = -0.73) 3. Danceability and valence show moderate positive correlation (r = 0.55)
# Select important features for visualization
key_features <- c("danceability", "energy", "valence", "tempo", "loudness", "acousticness")
# Create plots for each feature
plot_list <- lapply(key_features, function(feature) {
ggplot(spotify_data, aes_string(x = feature)) +
geom_histogram(fill = "steelblue", bins = 30) +
theme_minimal() +
labs(title = paste("Distribution of", feature))
})
# Arrange plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))Distribution insights: 1. Danceability shows a roughly normal distribution centered around 0.5 2. Energy is slightly right-skewed 3. Valence (musical positiveness) is fairly uniformly distributed 4. Tempo shows multiple peaks, suggesting different common BPM ranges 5. Loudness follows a normal distribution 6. Acousticness shows a U-shaped distribution, suggesting songs tend to be either very acoustic or very non-acoustic
# Calculate WSS for different k values
wss <- numeric(10)
for (k in 1:10) {
km <- kmeans(scaled_features, centers = k, nstart = 25)
wss[k] <- km$tot.withinss
}
# Plot elbow curve
plot(1:10, wss, type = "b",
xlab = "Number of Clusters (k)",
ylab = "Within-cluster Sum of Squares",
main = "Elbow Method for Optimal k")# Perform PCA
pca_result <- prcomp(scaled_features)
# Calculate variance explained
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
# Create dataframe for plotting
pca_data <- data.frame(
PC1 = pca_result$x[,1],
PC2 = pca_result$x[,2],
cluster = spotify_data$cluster
)
# Plot clusters
ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.5) +
theme_minimal() +
labs(title = "Cluster Visualization using PCA",
x = paste("PC1 (", round(var_explained[1]*100, 1), "% variance explained)", sep=""),
y = paste("PC2 (", round(var_explained[2]*100, 1), "% variance explained)", sep=""))# Calculate cluster centers
cluster_centers <- aggregate(audio_features,
by = list(Cluster = spotify_data$cluster),
mean)
# Create heatmap of cluster centers
cluster_centers_long <- cluster_centers %>%
gather(key = "feature", value = "value", -Cluster)
ggplot(cluster_centers_long, aes(x = feature, y = Cluster, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Cluster Characteristics Heatmap")Cluster Interpretations: