library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
# Location of the Spotify songs CSV export on the local machine
data_path <- "C:/Users/priya/Downloads/spotify_songs.csv"
# Read the CSV into a data frame
spotify_songs <- read.csv(file = data_path)
# Show every column with its type and leading values
glimpse(spotify_songs)
## Rows: 32,833
## Columns: 23
## $ track_id <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity <int> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key <int> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode <int> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms <int> 194754, 162600, 176616, 169093, 189052, 16304…
# One-way ANOVA: does mean track popularity differ between playlist genres?
anova_model <- aov(
  formula = track_popularity ~ playlist_genre,
  data = spotify_songs
)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## playlist_genre 5 627013 125403 207.2 <2e-16 ***
## Residuals 32827 19866852 605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Multiple linear regression of track popularity on three audio features
lm_model <- lm(
  formula = track_popularity ~ danceability + energy + loudness,
  data = spotify_songs
)
summary(lm_model)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + loudness,
## data = spotify_songs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.806 -17.885 2.968 19.098 79.925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.45986 1.28864 60.110 < 2e-16 ***
## danceability 6.16664 0.93832 6.572 5.04e-11 ***
## energy -36.79330 1.02164 -36.014 < 2e-16 ***
## loudness 1.98177 0.06164 32.152 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.42 on 32829 degrees of freedom
## Multiple R-squared: 0.04506, Adjusted R-squared: 0.04498
## F-statistic: 516.4 on 3 and 32829 DF, p-value: < 2.2e-16
# Pull the coefficient table (estimate, std. error, t value, p-value)
# out of the regression summary and display it
lm_summary <- summary(lm_model)
coefficients <- lm_summary[["coefficients"]]
coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.459864 1.28864363 60.109608 0.000000e+00
## danceability 6.166644 0.93831761 6.572022 5.037886e-11
## energy -36.793301 1.02164453 -36.013799 1.367851e-278
## loudness 1.981769 0.06163723 32.152147 2.424536e-223
# Keep the ANOVA table in a variable so its statistics can be extracted later
anova_summary <- summary(object = anova_model)
print(anova_summary)
## Df Sum Sq Mean Sq F value Pr(>F)
## playlist_genre 5 627013 125403 207.2 <2e-16 ***
## Residuals 32827 19866852 605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The first element of the summary is the ANOVA table; its first "Pr(>F)"
# entry is the p-value for playlist_genre (a single numeric value)
anova_table <- anova_summary[[1]]
p_value <- anova_table[["Pr(>F)"]][1]
# Choose the conclusion at the 5% significance level
conclusion <- if (p_value < 0.05) {
  "Reject the null hypothesis. There is a significant difference in track popularity across genres."
} else {
  "Fail to reject the null hypothesis. There is no significant difference in track popularity across genres."
}
# Display the p-value, then the conclusion
p_value
## [1] 2.702657e-218
conclusion
## [1] "Reject the null hypothesis. There is a significant difference in track popularity across genres."
# Tally the number of songs per playlist genre, most frequent first
genre_counts <- count(spotify_songs, playlist_genre, sort = TRUE)
# Inspect the tallies to decide whether any genres need consolidating
print(genre_counts)
## playlist_genre n
## 1 edm 6043
## 2 rap 5746
## 3 pop 5507
## 4 r&b 5431
## 5 latin 5155
## 6 rock 4951
# Keep the 10 most frequent genres and lump any remaining ones into "Other".
# fct_lump_n() is the current forcats API; the generic fct_lump() is superseded.
# NOTE: genre_counts lists only 6 genres, so with n = 10 nothing is lumped
# here -- the only effect is converting playlist_genre from character to factor.
spotify_songs <- spotify_songs %>%
  mutate(playlist_genre = fct_lump_n(playlist_genre, n = 10))
# Confirm which genre levels remain after the lumping step
unique(spotify_songs$playlist_genre)
## [1] pop rap rock latin r&b edm
## Levels: edm latin pop r&b rap rock
# Refit the one-way ANOVA on the consolidated genre factor
anova_model <- aov(
  formula = track_popularity ~ playlist_genre,
  data = spotify_songs
)
# Keep the ANOVA table for the refitted model
anova_summary <- summary(object = anova_model)
# First "Pr(>F)" entry of the table is the genre p-value (single numeric value)
anova_table <- anova_summary[[1]]
p_value <- anova_table[["Pr(>F)"]][1]
# Choose the conclusion at the 5% significance level
conclusion <- if (p_value < 0.05) {
  "Reject the null hypothesis. There is a significant difference in track popularity across genres."
} else {
  "Fail to reject the null hypothesis. There is no significant difference in track popularity across genres."
}
# Display the p-value, then the conclusion
p_value
## [1] 2.702657e-218
conclusion
## [1] "Reject the null hypothesis. There is a significant difference in track popularity across genres."
# Scatter plot of track popularity against danceability with a fitted
# straight-line trend (no confidence band).
# The data has 32,833 rows, so fully opaque points pile up into a solid block;
# semi-transparent points let the density of observations show through.
ggplot(spotify_songs, aes(x = danceability, y = track_popularity)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE, color = "skyblue") +
  theme_minimal() +
  labs(
    title = "Relationship between Danceability and Track Popularity",
    x = "Danceability",
    y = "Track Popularity"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Box plots comparing the popularity distribution of each playlist genre;
# x-axis labels are rotated 90 degrees
ggplot(
  data = spotify_songs,
  mapping = aes(x = playlist_genre, y = track_popularity)
) +
  geom_boxplot() +
  labs(
    title = "Track Popularity by Playlist Genre",
    x = "Playlist Genre",
    y = "Track Popularity"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
If the p-value is less than 0.05 (Reject the null hypothesis):
Explanation: “There is sufficient evidence to conclude that the genre of a song (as represented by playlist_genre) significantly impacts its popularity (track_popularity). This means that different genres tend to have different average popularity levels. Therefore, people interested in optimizing a song’s popularity should carefully consider the genre as a key factor when making decisions about song production or marketing strategies.”
Implication: “For someone in the music industry (e.g., producers or marketers), it would be important to pay attention to the genres that tend to have higher popularity if the goal is to reach a broader audience. Genres that consistently perform better could be prioritized for promotion.”
If the p-value is greater than or equal to 0.05 (Fail to reject the null hypothesis):
Explanation: “There is not enough evidence to conclude that the genre of a song significantly affects its popularity. This suggests that, in this dataset, the differences in the average popularity of songs across genres are not statistically significant. In other words, it would be safe to assume that genre alone may not be a strong predictor of a song’s popularity.”
Implication: “For people interested in this data, it implies that other factors, such as marketing efforts, the artist’s popularity, or specific song characteristics like danceability or energy, may play a more crucial role in determining a song’s popularity. Therefore, focusing on improving those attributes could be a better strategy than relying on genre alone to boost popularity.”
This conclusion helps stakeholders make data-informed decisions based on the outcome of the ANOVA test and shows them how to interpret the influence of genres on popularity.
After performing the ANOVA test on track_popularity by playlist_genre, we determined whether there are statistically significant differences in the mean popularity of songs across genres. A p-value below 0.05 indicates that mean popularity differs between genres; a p-value greater than or equal to 0.05 means the data do not show such a difference, so genre alone does not explain the differences in popularity across the dataset. Here the test gave F = 207.2 with p ≈ 2.7e-218, so we reject the null hypothesis: mean track popularity differs significantly across genres.
This insight is crucial for anyone involved in the music industry (producers, marketers, artists). If genre significantly influences popularity, stakeholders can optimize their efforts based on genre trends. For example, focusing on genres that consistently have higher popularity might be a good strategy to maximize reach. If genre does not significantly influence popularity, then factors other than genre (e.g., marketing, artist reputation) might have a bigger impact.
The multiple linear regression model we built uses danceability, energy, and loudness as predictors of track_popularity, and its coefficients show the strength and direction of each relationship with the other two predictors held constant. For instance, the positive danceability coefficient (about 6.17) indicates that as the danceability of a song increases, its predicted popularity also increases. The R-squared value tells us how well the three predictors together explain the variance in track_popularity; here it is about 0.045, so the model accounts for only around 4.5% of that variance.
Understanding the relationship between danceability and popularity helps music producers and artists make informed decisions when creating songs. If the model shows a strong positive relationship, it might suggest that songs with a more rhythmic, danceable quality tend to be more popular. For those aiming to increase song popularity, enhancing the danceability feature could be an effective strategy.
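Consolidating genres into fewer categories reduces complexity and makes the ANOVA test more interpretable. This step is important when there are many categories that might dilute the overall analysis, especially when less frequent categories might not provide meaningful comparisons. In this dataset, however, playlist_genre has only six categories, so keeping the top 10 left every genre unchanged, and the ANOVA result after consolidation is identical to the original (p ≈ 2.7e-218).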
Consolidating categories into the most relevant groups allows for more meaningful statistical analysis. When a variable has many sparse categories, grouping the top genres and lumping the rest as “Other” lets the ANOVA test focus on key categories, helping stakeholders identify which major genres are most influential. For those in the industry, it simplifies decision-making by highlighting only the most impactful genres, instead of considering too many fragmented categories.
Intercept: This value represents the baseline track popularity when all predictor variables (danceability, energy, and loudness) are set to zero. In this model the estimate is about 77.46. In a practical context, this value may not be realistic, as it’s rare for all these variables to be zero for a song, but it provides a reference level for popularity when considering the influence of each variable.
Danceability Coefficient: This coefficient indicates the change in track popularity for a one-unit increase in danceability, assuming all other variables remain constant. A positive coefficient implies that higher danceability is associated with higher popularity, while a negative coefficient would suggest the opposite. In this model the estimate is about 6.17, so higher danceability is associated with higher predicted popularity.
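Energy Coefficient: This coefficient shows how track popularity changes with a one-unit increase in energy, holding other factors constant. If the coefficient is positive, songs with more energy tend to be more popular; a negative value would suggest that higher energy may decrease popularity. In this model the estimate is about -36.79, so, at a given danceability and loudness, higher energy is associated with lower predicted popularity.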
Loudness Coefficient: This coefficient captures the change in track popularity for each additional unit of loudness, with other factors unchanged. A positive coefficient means that louder songs might be more popular, while a negative one would suggest that loudness could decrease popularity. In this model the estimate is about 1.98, so, at a given danceability and energy, louder songs are associated with higher predicted popularity.