# Load required libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Load the dataset
# The CSV location defaults to the original author's path. Set the
# SPOTIFY_SONGS_CSV environment variable to run the script on another machine
# without editing the code.
spotify_csv_path <- Sys.getenv("SPOTIFY_SONGS_CSV", unset = "")
if (!nzchar(spotify_csv_path)) {
  spotify_csv_path <- "C:/Users/priya/Downloads/spotify_songs.csv"
}
# Fail early with the resolved path in the message rather than a bare
# "cannot open file" from read.csv().
if (!file.exists(spotify_csv_path)) {
  stop("Spotify dataset not found at: ", spotify_csv_path, call. = FALSE)
}
spotify_data <- read.csv(spotify_csv_path)
# -----------------------------------------------
# Step 1: Exploratory Data Analysis (EDA)
# -----------------------------------------------
# Summary of the dataset
summary(spotify_data)
## track_id track_name track_artist track_popularity
## Length:32833 Length:32833 Length:32833 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 24.00
## Mode :character Mode :character Mode :character Median : 45.00
## Mean : 42.48
## 3rd Qu.: 62.00
## Max. :100.00
## track_album_id track_album_name track_album_release_date
## Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## playlist_name playlist_id playlist_genre playlist_subgenre
## Length:32833 Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## danceability energy key loudness
## Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
## 1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
## Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
## Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
## 3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
## Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
## Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
## Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
## 3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
## Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
## liveness valence tempo duration_ms
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
## 1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
## Median :0.1270 Median :0.5120 Median :121.98 Median :216000
## Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
## 3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
## Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
# Histogram of track popularity in 5-point-wide bins
ggplot(data = spotify_data, mapping = aes(x = track_popularity)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "lightblue") +
  theme_minimal() +
  labs(x = "Popularity", y = "Count", title = "Popularity Distribution")
# Pairwise correlations between three audio features and popularity,
# drawn as a coloured grid with the coefficient printed in each cell
audio_features <- subset(
  spotify_data,
  select = c(energy, danceability, acousticness, track_popularity)
)
correlation_matrix <- cor(x = audio_features)
corrplot(
  corr = correlation_matrix,
  method = "color",
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black"
)
# -----------------------------------------------
# Step 2: Hypothesis Testing - Energy vs. Popularity
# -----------------------------------------------
# Split energy into three bands (Low / Medium / High) and store the band as a
# new column, then compare the popularity distribution across the bands
spotify_data$energy_category <- cut(
  spotify_data$energy,
  breaks = 3,
  labels = c("Low", "Medium", "High")
)
ggplot(data = spotify_data, mapping = aes(x = energy_category, y = track_popularity)) +
  geom_boxplot(fill = "lightblue") +
  theme_minimal() +
  labs(x = "Energy Levels", y = "Popularity", title = "Popularity by Energy Levels")
# Energy against popularity: semi-transparent points with a fitted
# straight line (linear model) overlaid in red
ggplot(data = spotify_data, mapping = aes(x = energy, y = track_popularity)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  theme_minimal() +
  labs(x = "Energy", y = "Popularity", title = "Energy vs. Popularity")
## `geom_smooth()` using formula = 'y ~ x'
# -----------------------------------------------
# Step 3: Hypothesis Testing - Danceability vs. Popularity
# -----------------------------------------------
# Danceability against popularity: semi-transparent blue points with a
# fitted straight line (linear model) overlaid in red
ggplot(data = spotify_data, mapping = aes(x = danceability, y = track_popularity)) +
  geom_point(colour = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  theme_minimal() +
  labs(x = "Danceability", y = "Popularity", title = "Danceability vs. Popularity")
## `geom_smooth()` using formula = 'y ~ x'
# -----------------------------------------------
# Step 4: Linear Regression Analysis
# -----------------------------------------------
# Regress popularity on energy, danceability and acousticness
linear_model <- lm(
  formula = track_popularity ~ energy + danceability + acousticness,
  data = spotify_data
)
# Keep the summary object: later steps read its coefficient table and R²
model_summary <- summary(object = linear_model)
print(model_summary)
##
## Call:
## lm(formula = track_popularity ~ energy + danceability + acousticness,
## data = spotify_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.430 -18.369 2.715 19.421 58.897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.7865 1.0086 42.420 < 2e-16 ***
## energy -11.1670 0.9043 -12.348 < 2e-16 ***
## danceability 10.1326 0.9496 10.671 < 2e-16 ***
## acousticness 4.8866 0.7423 6.583 4.69e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.78 on 32829 degrees of freedom
## Multiple R-squared: 0.01629, Adjusted R-squared: 0.0162
## F-statistic: 181.2 on 3 and 32829 DF, p-value: < 2.2e-16
# Interpreting coefficients
# Look each estimate up by term name rather than by position, so the labels
# stay correct if the model formula is reordered or extended.
coefficients <- coef(linear_model)
cat("Intercept:", coefficients[["(Intercept)"]], "\n")
## Intercept: 42.78646
cat("Energy Coefficient:", coefficients[["energy"]], "\n")
## Energy Coefficient: -11.16702
cat("Danceability Coefficient:", coefficients[["danceability"]], "\n")
## Danceability Coefficient: 10.13258
cat("Acousticness Coefficient:", coefficients[["acousticness"]], "\n")
## Acousticness Coefficient: 4.886646
# Check significant coefficients
# The p-value column is selected by name. The term names are taken from the
# summary table itself (via the named logical vector) because coef() keeps
# aliased (NA) terms that the summary table omits, which would misalign a
# mask applied to names(coefficients).
significant_coeff <- model_summary$coefficients[, "Pr(>|t|)"] < 0.05
cat("Significant Coefficients:\n", names(which(significant_coeff)), "\n")
## Significant Coefficients:
## (Intercept) energy danceability acousticness
# -----------------------------------------------
# Step 5: Diagnostic Plots
# -----------------------------------------------
# Diagnostic plots for linear regression
# par() returns the settings it replaces; keep them so the device can be put
# back exactly as it was instead of assuming the previous layout was 1x1.
old_par <- par(mfrow = c(2, 2)) # Arrange plots in a 2x2 grid
plot(linear_model)
# Restore the previous plot layout
par(old_par)
# -----------------------------------------------
# Step 6: Insights and Conclusion
# -----------------------------------------------
# Key takeaways
# Wording follows the fitted model above: the energy coefficient is negative,
# the danceability and acousticness coefficients are positive, and the overall
# R² (about 0.016) means none of these relationships can be called strong.
cat("\nKey Insights:\n")
##
## Key Insights:
cat("1. Energy has a weak negative relationship with popularity.\n")
## 1. Energy has a weak negative relationship with popularity.
cat("2. Danceability has a weak positive relationship with popularity.\n")
## 2. Danceability has a weak positive relationship with popularity.
cat("3. Acousticness has a slight positive impact on popularity, consistent with the data trends.\n")
## 3. Acousticness has a slight positive impact on popularity, consistent with the data trends.
# Share of the variance in popularity explained by the model (R² value)
cat("Model R²:", model_summary[["r.squared"]], "\n")
## Model R²: 0.01629091
# Write the data frame (including the derived energy_category column) to a
# CSV in the working directory for later analysis
write.csv(
  x = spotify_data,
  file = "spotify_data_updated.csv",
  row.names = FALSE
)
# -----------------------------------------------
# Step 7: Save Plots (Optional)
# -----------------------------------------------
# Rebuild the popularity-by-energy box plot and write it to a PNG file
ggsave(
  filename = "popularity_by_energy_levels.png",
  plot = ggplot(data = spotify_data, mapping = aes(x = energy_category, y = track_popularity)) +
    geom_boxplot(fill = "lightblue") +
    theme_minimal() +
    labs(x = "Energy Levels", y = "Popularity", title = "Popularity by Energy Levels")
)
## Saving 7 x 5 in image
# Rebuild the energy-vs-popularity scatter plot (with its fitted line) and
# write it to a PNG file
ggsave(
  filename = "energy_vs_popularity.png",
  plot = ggplot(data = spotify_data, mapping = aes(x = energy, y = track_popularity)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", colour = "red") +
    theme_minimal() +
    labs(x = "Energy", y = "Popularity", title = "Energy vs. Popularity")
)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
Energy’s Impact: Tracks with higher energy levels tend to have lower popularity, possibly reflecting a listener preference for more mellow or diverse tracks. Danceability’s Importance: Danceability is positively associated with popularity, suggesting that tracks designed for dancing may perform better in terms of popularity. Acousticness and Popularity: Tracks with higher acousticness tend to be slightly more popular, possibly indicating an appreciation for acoustic elements in music. Model Performance: While the model identifies statistically significant relationships, its low R² suggests there are additional unaccounted-for factors influencing track popularity.
This analysis helps identify which audio features correlate with track popularity, providing valuable insights for music producers, streaming services, and marketers. For example, producers might focus on increasing danceability to target higher popularity, while understanding that energy alone does not guarantee success.
Danceability has a stronger positive relationship with popularity compared to energy and acousticness. Energy negatively impacts popularity, but the relationship is weak. Acousticness has a slight positive impact, aligning with the data trends. The regression model, while significant, explains only a small portion of the variability in popularity, highlighting the complexity of factors affecting track success.