Load necessary libraries

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.3.3

Load the dataset

# Read the Spotify songs CSV into a data frame.
# NOTE(review): the path is absolute and machine-specific; confirm it before
# running this on another computer.
spotify_songs <- read.csv(
  file = "C:/Users/priya/Downloads/spotify_songs.csv"
)
# Show every column with its type and first few values
spotify_songs %>%
  glimpse()
## Rows: 32,833
## Columns: 23
## $ track_id                 <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name               <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist             <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity         <int> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id           <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name         <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name            <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id              <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre           <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre        <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability             <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy                   <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key                      <int> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness                 <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode                     <int> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness              <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness             <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness         <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness                 <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence                  <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo                    <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms              <int> 194754, 162600, 176616, 169093, 189052, 16304…

Run ANOVA test to see how popularity (‘track_popularity’) varies across different music genres (‘playlist_genre’)

# One-way ANOVA: does mean track_popularity differ between playlist_genre groups?
anova_model <- aov(
  formula = track_popularity ~ playlist_genre,
  data = spotify_songs
)
# Print the ANOVA table (Df, Sum Sq, Mean Sq, F value, Pr(>F))
summary(object = anova_model)
##                   Df   Sum Sq Mean Sq F value Pr(>F)    
## playlist_genre     5   627013  125403   207.2 <2e-16 ***
## Residuals      32827 19866852     605                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Building a linear regression model with popularity as the response variable

Using ‘danceability’, ‘energy’, and ‘loudness’ as explanatory variables for demonstration

# Least-squares fit of track_popularity on three audio features
lm_model <- lm(
  formula = track_popularity ~ danceability + energy + loudness,
  data = spotify_songs
)

Summarize the regression model

# Print residual quantiles, the coefficient table, and the fit statistics
lm_model %>%
  summary()
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + loudness, 
##     data = spotify_songs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.806 -17.885   2.968  19.098  79.925 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   77.45986    1.28864  60.110  < 2e-16 ***
## danceability   6.16664    0.93832   6.572 5.04e-11 ***
## energy       -36.79330    1.02164 -36.014  < 2e-16 ***
## loudness       1.98177    0.06164  32.152  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.42 on 32829 degrees of freedom
## Multiple R-squared:  0.04506,    Adjusted R-squared:  0.04498 
## F-statistic: 516.4 on 3 and 32829 DF,  p-value: < 2.2e-16
# Pull the coefficient table (Estimate, Std. Error, t value, Pr(>|t|)) out of
# the model summary and display it
coefficients <- coef(summary(lm_model))
print(coefficients)
##                Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)   77.459864 1.28864363  60.109608  0.000000e+00
## danceability   6.166644 0.93831761   6.572022  5.037886e-11
## energy       -36.793301 1.02164453 -36.013799 1.367851e-278
## loudness       1.981769 0.06163723  32.152147 2.424536e-223
# Store the ANOVA table so individual entries can be extracted from it,
# then display it
anova_summary <- summary(anova_model)
print(anova_summary)
##                   Df   Sum Sq Mean Sq F value Pr(>F)    
## playlist_genre     5   627013  125403   207.2 <2e-16 ***
## Residuals      32827 19866852     605                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pull the p-value of the playlist_genre term (first row of the ANOVA table)
p_value <- anova_summary[[1]][1, "Pr(>F)"]

# Decide at the 5% significance level and record the verdict as text
conclusion <- if (p_value < 0.05) {
  "Reject the null hypothesis. There is a significant difference in track popularity across genres."
} else {
  "Fail to reject the null hypothesis. There is no significant difference in track popularity across genres."
}

# Show the p-value followed by the verdict
print(p_value)
## [1] 2.702657e-218
print(conclusion)
## [1] "Reject the null hypothesis. There is a significant difference in track popularity across genres."
# Tally songs per playlist_genre, largest group first
genre_counts <- count(spotify_songs, playlist_genre, sort = TRUE)

# Display the tally (used to decide whether consolidation is needed)
print(genre_counts)
##   playlist_genre    n
## 1            edm 6043
## 2            rap 5746
## 3            pop 5507
## 4            r&b 5431
## 5          latin 5155
## 6           rock 4951
# Lump every genre outside the 10 most frequent into a single "Other" level.
# fct_lump_n() is the explicit successor of the superseded fct_lump(n = ...).
# No category-count check is needed: when the column has 10 or fewer distinct
# genres, all levels are kept and the column is only converted to a factor.
spotify_songs <- spotify_songs %>%
  mutate(playlist_genre = fct_lump_n(playlist_genre, n = 10))

# Check the unique categories after consolidation
unique(spotify_songs$playlist_genre)
## [1] pop   rap   rock  latin r&b   edm  
## Levels: edm latin pop r&b rap rock
# Refit the one-way ANOVA on the consolidated playlist_genre factor
anova_model <- aov(
  formula = track_popularity ~ playlist_genre,
  data = spotify_songs
)

# Store the ANOVA table of the refitted model
anova_summary <- summary(anova_model)

# p-value of the playlist_genre term (first entry of the Pr(>F) column)
p_value <- anova_summary[[1]][["Pr(>F)"]][[1]]

# Verdict at the 5% significance level
if (p_value >= 0.05) {
  conclusion <- "Fail to reject the null hypothesis. There is no significant difference in track popularity across genres."
} else {
  conclusion <- "Reject the null hypothesis. There is a significant difference in track popularity across genres."
}

# Display the p-value, then the verdict
print(p_value)
## [1] 2.702657e-218
print(conclusion)
## [1] "Reject the null hypothesis. There is a significant difference in track popularity across genres."

Visualizing the relationship between popularity and danceability

# Scatter plot of track popularity against danceability, overlaid with a
# fitted straight line (no confidence band)
ggplot(
  data = spotify_songs,
  mapping = aes(x = danceability, y = track_popularity)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "skyblue") +
  theme_minimal() +
  labs(
    title = "Relationship between Danceability and Track Popularity",
    x = "Danceability",
    y = "Track Popularity"
  )
## `geom_smooth()` using formula = 'y ~ x'

Visualizing ANOVA results: Boxplot of popularity by playlist_genre

# One box of track popularity per playlist genre; x-axis labels are rotated
# 90 degrees
ggplot(
  data = spotify_songs,
  mapping = aes(x = playlist_genre, y = track_popularity)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Track Popularity by Playlist Genre",
    x = "Playlist Genre",
    y = "Track Popularity"
  )

Explain what this might mean for people who may be interested in your data

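If the p-value is less than 0.05 (Reject the null hypothesis): mean track popularity differs between at least two playlist genres. This is the outcome for this dataset (p ≈ 2.7e-218).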

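If the p-value is greater than or equal to 0.05 (Fail to reject the null hypothesis): the data do not provide enough evidence that mean track popularity differs between playlist genres. This is not the outcome for this dataset.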

This conclusion helps stakeholders make data-informed decisions based on the outcome of the ANOVA test and shows them how to interpret the influence of genres on popularity.

ANOVA Test on Categorical Data (Playlist Genre)

Insight:

After performing the ANOVA test on track_popularity based on playlist_genre, we found statistically significant differences in the mean popularity of songs across genres (F(5, 32827) = 207.2, p ≈ 2.7e-218). Because the p-value is far below 0.05, we conclude that mean popularity is not the same in every genre. Had the p-value been greater than 0.05, the data would not have provided enough evidence that mean popularity differs by genre. Note that statistical significance is not the same as effect size: genre accounts for only about 3% of the total variation in popularity (627,013 of a total sum of squares of 20,493,865).

Significance:

This insight is crucial for anyone involved in the music industry (producers, marketers, artists). If genre significantly influences popularity, stakeholders can optimize their efforts based on genre trends. For example, focusing on genres that consistently have higher popularity might be a good strategy to maximize reach. If genre does not significantly influence popularity, then factors other than genre (e.g., marketing, artist reputation) might have a bigger impact.

Further Questions:

  1. If genre is significant, which specific genres contribute the most to higher popularity? Can we further investigate this using post-hoc tests or pairwise comparisons between genres?
  2. If genre is not significant, what other variables (like danceability, energy) might play a larger role in determining popularity?

Linear Regression Model Using Danceability, Energy, and Loudness

Insight:

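The linear regression model we built uses danceability, energy, and loudness together as predictors of track_popularity. Each coefficient shows the strength and direction of the relationship between that predictor and popularity while the other two are held constant. For instance, the positive danceability coefficient (6.17) indicates that as the danceability of a song increases, its predicted popularity also increases. The R-squared value (0.045) tells us how well the three predictors jointly, not danceability alone, explain the variance in track_popularity: about 4.5%, so most of the variation remains unexplained.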

Significance:

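Understanding the relationship between danceability and popularity helps music producers and artists make informed decisions when creating songs. If the model shows a strong positive relationship, it might suggest that songs with a more rhythmic, danceable quality tend to be more popular. Here the relationship is positive but weak (the whole model explains about 4.5% of the variance), and a regression on observational data shows association rather than causation. For those aiming to increase song popularity, enhancing the danceability feature could be an effective strategy, but it should be treated as a hypothesis to test rather than a proven result.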

Further Questions:

  1. How does danceability interact with other features (like energy or loudness)? The current model already includes energy and loudness as additive terms; would adding interaction terms or further explanatory variables provide a better fit and deeper insights into song popularity?
  2. Can this result be generalized across different genres, or is danceability more important for certain genres than others?

Consolidating Categorical Data

Insight:

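Consolidating genres into fewer categories is meant to reduce complexity and make the ANOVA test more interpretable. This step is important when there are many categories that might dilute the overall analysis, especially when less frequent categories might not provide meaningful comparisons. In this dataset, however, playlist_genre has only six categories, so keeping the top 10 left every genre unchanged, and the ANOVA p-value is identical before and after the step (2.70e-218).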

Significance:

Consolidating categories into the most relevant groups allows for more meaningful statistical analysis. When a variable has many levels, grouping the top genres and lumping the rest as “Other” lets the ANOVA test focus on key categories, helping stakeholders identify which major genres are most influential. (No “Other” group was created here, because all six genres fall within the top 10.) For those in the industry, it simplifies decision-making by highlighting only the most impactful genres, instead of considering too many fragmented categories.

Further Questions:

  1. Should these consolidations be data-driven (based on frequency) or conceptual (based on genre similarities)?
  2. Are the results stable if we use different consolidation strategies (e.g., grouping based on genre characteristics rather than frequency)?

Coefficients Explanation:

Intercept (77.46): This value represents the predicted track popularity when all predictor variables (danceability, energy, and loudness) are set to zero. In a practical context, this value may not be realistic, as it’s rare for all these variables to be zero for a song, but it provides a reference level for popularity when considering the influence of each variable.

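Danceability Coefficient (6.17): This coefficient indicates the change in predicted track popularity for a one-unit increase in danceability, assuming all other variables remain constant. The estimate is positive, so higher danceability is associated with higher popularity; a negative coefficient would suggest the opposite. The danceability values shown in the data preview all lie between 0 and 1, so a one-unit increase appears to span most of the variable's range; a 0.1 increase corresponds to about 0.62 popularity points.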

Energy Coefficient (−36.79): This coefficient shows how predicted track popularity changes with a one-unit increase in energy, holding other factors constant. The estimate is negative, so, with danceability and loudness held constant, songs with more energy tend to be less popular; a positive value would have suggested that songs with more energy tend to be more popular.

Loudness Coefficient (1.98): This coefficient captures the change in predicted track popularity for each additional unit of loudness, with other factors unchanged. The estimate is positive, so each additional unit of loudness is associated with about 1.98 more popularity points; a negative coefficient would suggest that loudness could decrease popularity.