# Load required libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Load the dataset
# The CSV location can be overridden with the SPOTIFY_SONGS_CSV environment
# variable so the script is not tied to one machine; when the variable is
# unset or empty, the original hard-coded path is used.
spotify_csv_path <- Sys.getenv("SPOTIFY_SONGS_CSV", unset = "")
if (!nzchar(spotify_csv_path)) {
  spotify_csv_path <- "C:/Users/priya/Downloads/spotify_songs.csv"
}
# Fail early with a message that names the path that was tried.
if (!file.exists(spotify_csv_path)) {
  stop("Spotify dataset not found at: ", spotify_csv_path, call. = FALSE)
}
spotify_data <- read.csv(spotify_csv_path)

# -----------------------------------------------
# Step 1: Exploratory Data Analysis (EDA)
# -----------------------------------------------

# Print per-column summary statistics for the loaded data frame
summary(object = spotify_data)
##    track_id          track_name        track_artist       track_popularity
##  Length:32833       Length:32833       Length:32833       Min.   :  0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 24.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 45.00  
##                                                           Mean   : 42.48  
##                                                           3rd Qu.: 62.00  
##                                                           Max.   :100.00  
##  track_album_id     track_album_name   track_album_release_date
##  Length:32833       Length:32833       Length:32833            
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##  playlist_name      playlist_id        playlist_genre     playlist_subgenre 
##  Length:32833       Length:32833       Length:32833       Length:32833      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   danceability        energy              key            loudness      
##  Min.   :0.0000   Min.   :0.000175   Min.   : 0.000   Min.   :-46.448  
##  1st Qu.:0.5630   1st Qu.:0.581000   1st Qu.: 2.000   1st Qu.: -8.171  
##  Median :0.6720   Median :0.721000   Median : 6.000   Median : -6.166  
##  Mean   :0.6548   Mean   :0.698619   Mean   : 5.374   Mean   : -6.720  
##  3rd Qu.:0.7610   3rd Qu.:0.840000   3rd Qu.: 9.000   3rd Qu.: -4.645  
##  Max.   :0.9830   Max.   :1.000000   Max.   :11.000   Max.   :  1.275  
##       mode         speechiness      acousticness    instrumentalness   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000000  
##  1st Qu.:0.0000   1st Qu.:0.0410   1st Qu.:0.0151   1st Qu.:0.0000000  
##  Median :1.0000   Median :0.0625   Median :0.0804   Median :0.0000161  
##  Mean   :0.5657   Mean   :0.1071   Mean   :0.1753   Mean   :0.0847472  
##  3rd Qu.:1.0000   3rd Qu.:0.1320   3rd Qu.:0.2550   3rd Qu.:0.0048300  
##  Max.   :1.0000   Max.   :0.9180   Max.   :0.9940   Max.   :0.9940000  
##     liveness         valence           tempo         duration_ms    
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  4000  
##  1st Qu.:0.0927   1st Qu.:0.3310   1st Qu.: 99.96   1st Qu.:187819  
##  Median :0.1270   Median :0.5120   Median :121.98   Median :216000  
##  Mean   :0.1902   Mean   :0.5106   Mean   :120.88   Mean   :225800  
##  3rd Qu.:0.2480   3rd Qu.:0.6930   3rd Qu.:133.92   3rd Qu.:253585  
##  Max.   :0.9960   Max.   :0.9910   Max.   :239.44   Max.   :517810
# Histogram of track popularity, counted in bins that are 5 points wide
ggplot(data = spotify_data, mapping = aes(x = track_popularity)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "lightblue") +
  ggtitle("Popularity Distribution") +
  xlab("Popularity") +
  ylab("Count") +
  theme_minimal()

# Pairwise correlations between three audio features and track popularity,
# drawn as a colour-coded matrix with the coefficient printed in every cell
audio_features <- spotify_data %>%
  select(energy, danceability, acousticness, track_popularity)
correlation_matrix <- cor(audio_features)
corrplot(
  correlation_matrix,
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)

# -----------------------------------------------
# Step 2: Hypothesis Testing - Energy vs. Popularity
# -----------------------------------------------

# Split energy into three intervals (labelled Low / Medium / High) and store
# the result as a new column, then compare popularity across those groups
spotify_data$energy_category <- cut(
  spotify_data$energy,
  breaks = 3,
  labels = c("Low", "Medium", "High")
)

ggplot(
  data = spotify_data,
  mapping = aes(x = energy_category, y = track_popularity)
) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Popularity by Energy Levels") +
  xlab("Energy Levels") +
  ylab("Popularity") +
  theme_minimal()

# Energy against popularity: semi-transparent points plus a red linear fit
ggplot(
  data = spotify_data,
  mapping = aes(x = energy, y = track_popularity)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Energy vs. Popularity") +
  xlab("Energy") +
  ylab("Popularity") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# -----------------------------------------------
# Step 3: Hypothesis Testing - Danceability vs. Popularity
# -----------------------------------------------

# Danceability against popularity: blue semi-transparent points plus a red
# linear fit
ggplot(
  data = spotify_data,
  mapping = aes(x = danceability, y = track_popularity)
) +
  geom_point(colour = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Danceability vs. Popularity") +
  xlab("Danceability") +
  ylab("Popularity") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# -----------------------------------------------
# Step 4: Linear Regression Analysis
# -----------------------------------------------

# Regress track popularity on energy, danceability and acousticness
linear_model <- lm(
  formula = track_popularity ~ energy + danceability + acousticness,
  data = spotify_data
)

# Keep the summary object in a variable and print it
model_summary <- summary(object = linear_model)
print(model_summary)
## 
## Call:
## lm(formula = track_popularity ~ energy + danceability + acousticness, 
##     data = spotify_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.430 -18.369   2.715  19.421  58.897 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   42.7865     1.0086  42.420  < 2e-16 ***
## energy       -11.1670     0.9043 -12.348  < 2e-16 ***
## danceability  10.1326     0.9496  10.671  < 2e-16 ***
## acousticness   4.8866     0.7423   6.583 4.69e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.78 on 32829 degrees of freedom
## Multiple R-squared:  0.01629,    Adjusted R-squared:  0.0162 
## F-statistic: 181.2 on 3 and 32829 DF,  p-value: < 2.2e-16
# Interpreting coefficients
# Coefficients are looked up by term name rather than by position, so each
# label stays attached to the right estimate even if the model formula is
# reordered or extended.
coefficients <- coef(linear_model)
cat("Intercept:", coefficients[["(Intercept)"]], "\n")
## Intercept: 42.78646
cat("Energy Coefficient:", coefficients[["energy"]], "\n")
## Energy Coefficient: -11.16702
cat("Danceability Coefficient:", coefficients[["danceability"]], "\n")
## Danceability Coefficient: 10.13258
cat("Acousticness Coefficient:", coefficients[["acousticness"]], "\n")
## Acousticness Coefficient: 4.886646
# Check significant coefficients
# Term names come from the summary table itself: summary.lm() omits aliased
# coefficients, so a mask built from that table can be shorter than
# names(coef(linear_model)) and would then select the wrong names.
coefficient_table <- model_summary$coefficients
significant_coeff <- coefficient_table[, "Pr(>|t|)"] < 0.05
cat("Significant Coefficients:\n", rownames(coefficient_table)[significant_coeff], "\n")
## Significant Coefficients:
##  (Intercept) energy danceability acousticness
# -----------------------------------------------
# Step 5: Diagnostic Plots
# -----------------------------------------------

# Diagnostic plots for linear regression
# par() returns the previous values of the settings it changes; keep them so
# the layout that was active before this section can be restored afterwards
# instead of being forced to a single panel.
previous_par <- par(mfrow = c(2, 2)) # Arrange plots in a 2x2 grid
plot(linear_model)

# Restore the plot layout that was in effect before the diagnostics
par(previous_par)

# -----------------------------------------------
# Step 6: Insights and Conclusion
# -----------------------------------------------

# Key takeaways
# The wording follows the fitted model: energy has a negative coefficient,
# danceability and acousticness have positive coefficients, and the very low
# R-squared means every one of these relationships is weak.
cat("\nKey Insights:\n")
## 
## Key Insights:
cat("1. Energy has a weak negative relationship with popularity.\n")
## 1. Energy has a weak negative relationship with popularity.
cat("2. Danceability has a weak positive relationship with popularity.\n")
## 2. Danceability has a weak positive relationship with popularity.
cat("3. Acousticness has a weak positive relationship with popularity, consistent with the data trends.\n")
## 3. Acousticness has a weak positive relationship with popularity, consistent with the data trends.
# Share of variance in popularity explained by the model (R² value)
cat("Model R²:", model_summary$r.squared, "\n")
## Model R²: 0.01629091
# Write the current data frame to CSV without a row-name column
write.csv(
  x = spotify_data,
  file = "spotify_data_updated.csv",
  row.names = FALSE
)

# -----------------------------------------------
# Step 7: Save Plots (Optional)
# -----------------------------------------------

# Write the popularity-by-energy-level box plot to a PNG file
ggsave(
  filename = "popularity_by_energy_levels.png",
  plot = ggplot(
    data = spotify_data,
    mapping = aes(x = energy_category, y = track_popularity)
  ) +
    geom_boxplot(fill = "lightblue") +
    ggtitle("Popularity by Energy Levels") +
    xlab("Energy Levels") +
    ylab("Popularity") +
    theme_minimal()
)
## Saving 7 x 5 in image
# Write the energy-vs-popularity scatter plot (with linear fit) to a PNG file
ggsave(
  filename = "energy_vs_popularity.png",
  plot = ggplot(
    data = spotify_data,
    mapping = aes(x = energy, y = track_popularity)
  ) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", colour = "red") +
    ggtitle("Energy vs. Popularity") +
    xlab("Energy") +
    ylab("Popularity") +
    theme_minimal()
)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'

Explanations and Interpretations

  1. Popularity Distribution Explanation: The histogram shows the distribution of track popularity in the dataset. Interpretation: A large number of tracks are clustered around a popularity score of 0, while the remaining tracks are spread broadly across the scale. The summary statistics give a median of 45 and a mean of 42.48, so the mean sits slightly below the median rather than above it, as it would in a right-skewed distribution. Some tracks achieve high popularity scores, up to the maximum of 100.
  2. Correlation Heatmap Explanation: The heatmap illustrates the relationships between energy, danceability, acousticness, and track popularity. Interpretation: Energy has a weak negative correlation (-0.11) with track popularity. Danceability shows a very weak positive correlation (0.06) with track popularity. Acousticness shows a weak positive correlation (0.09) with track popularity.
  3. Popularity by Energy Levels (Box Plot) Explanation: This box plot groups popularity by low, medium, and high energy levels. Interpretation: Tracks with low, medium, and high energy levels show similar median popularity scores. However, tracks with lower energy levels exhibit slightly more variation in popularity.
  4. Energy vs. Popularity (Scatter Plot) Explanation: This scatter plot explores the relationship between energy and track popularity with a linear trendline. Interpretation: The trendline indicates a slight negative relationship between energy and popularity. Higher energy does not guarantee popularity.
  5. Danceability vs. Popularity (Scatter Plot) Explanation: This scatter plot explores the relationship between danceability and popularity with a linear trendline. Interpretation: The trendline suggests a weak positive relationship between danceability and popularity. Tracks that are more danceable are slightly more popular.
  6. Regression Model Output Explanation: The linear regression model evaluates the association of energy, danceability, and acousticness with track popularity. Interpretation: The coefficients indicate the direction and magnitude of the relationships, each holding the other two features constant. Because these features range from approximately 0 to 1 in this dataset, a one-unit increase spans a feature's entire range. Energy: A unit increase in energy is associated with an average decrease of 11.17 in popularity. Danceability: A unit increase in danceability is associated with an average increase of 10.13 in popularity. Acousticness: A unit increase in acousticness is associated with an average increase of 4.89 in popularity. All coefficients are statistically significant (p-values < 0.05). The model’s R² value (0.0163; adjusted R² 0.0162) is very low, indicating that only a small portion of the variance in popularity is explained by these features.
  7. Diagnostic Plots Explanation: These plots assess the assumptions of the linear regression model. Interpretation: Residuals vs. Fitted: No strong patterns, suggesting acceptable linearity. Q-Q Plot: Residuals deviate from normality, especially in the tails. Scale-Location: No significant heteroscedasticity detected. Residuals vs. Leverage: No overly influential observations except a few outliers.

Insights

Energy’s Impact: Tracks with higher energy levels tend to have lower popularity, possibly reflecting a listener preference for more mellow or diverse tracks. Danceability’s Importance: Danceability is positively associated with popularity, suggesting that tracks designed for dancing may perform better in terms of popularity. Acousticness and Popularity: Tracks with higher acousticness tend to be slightly more popular, possibly indicating an appreciation for acoustic elements in music. Model Performance: While the model identifies significant relationships, its low R² suggests there are additional unaccounted factors influencing track popularity.

Significance

This analysis helps identify which audio features correlate with track popularity, providing valuable insights for music producers, streaming services, and marketers. For example, producers might focus on increasing danceability to target higher popularity, while understanding that energy alone does not guarantee success.

Conclusion

Danceability has the largest positive regression coefficient (10.13, compared with 4.89 for acousticness), although its correlation with popularity is very weak (0.06). Energy negatively impacts popularity, but the relationship is weak. Acousticness has a slight positive impact, aligning with the data trends. The regression model, while significant, explains only a small portion of the variability in popularity, highlighting the complexity of factors affecting track success.