# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Read the Spotify track data from a local CSV export.
# NOTE(review): the path is absolute and machine-specific; anyone else
# running this script will need to point it at their own copy of the file.
spotify_songs <- read.csv(
  file = "C:/Users/priya/Downloads/spotify_songs.csv"
)
# Per-column summary statistics as a quick sanity check of the import
summary(object = spotify_songs)
## track_id track_name track_artist track_popularity
## Length:32833 Length:32833 Length:32833 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 24.00
## Mode :character Mode :character Mode :character Median : 45.00
## Mean : 42.48
## 3rd Qu.: 62.00
## Max. :100.00
## track_album_id track_album_name track_album_release_date
## Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## playlist_name playlist_id playlist_genre playlist_subgenre
## Length:32833 Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## danceability energy key loudness
## Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
## 1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
## Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
## Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
## 3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
## Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
## Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
## Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
## 3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
## Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
## liveness valence tempo duration_ms
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
## 1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
## Median :0.1270 Median :0.5120 Median :121.98 Median :216000
## Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
## 3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
## Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
# Baseline model: simple linear regression of track popularity on
# danceability alone, used as the reference point for the extended model.
model_previous <- lm(
  formula = track_popularity ~ danceability,
  data = spotify_songs
)
# Coefficients, residual spread and R-squared for the baseline fit
summary(model_previous)
##
## Call:
## lm(formula = track_popularity ~ danceability, data = spotify_songs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.024 -18.415 2.934 19.480 57.105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.1757 0.6361 55.30 <2e-16 ***
## danceability 11.1497 0.9484 11.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.93 on 32831 degrees of freedom
## Multiple R-squared: 0.004192, Adjusted R-squared: 0.004162
## F-statistic: 138.2 on 1 and 32831 DF, p-value: < 2.2e-16
# Extended model: danceability and energy enter with their interaction
# (`*` expands to both main effects plus danceability:energy), alongside
# loudness and duration_ms as additional predictors.
model_extended <- lm(
  formula = track_popularity ~ danceability * energy + loudness + duration_ms,
  data = spotify_songs
)
# Coefficients, residual spread and R-squared for the extended fit
summary(model_extended)
##
## Call:
## lm(formula = track_popularity ~ danceability * energy + loudness +
## duration_ms, data = spotify_songs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.52 -17.82 2.60 18.99 69.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.468e+01 2.441e+00 34.684 <2e-16 ***
## danceability 6.901e+00 3.330e+00 2.072 0.0382 *
## energy -3.232e+01 3.125e+00 -10.343 <2e-16 ***
## loudness 1.776e+00 6.204e-02 28.621 <2e-16 ***
## duration_ms -4.734e-05 2.278e-06 -20.783 <2e-16 ***
## danceability:energy -3.340e+00 4.712e+00 -0.709 0.4785
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.26 on 32827 degrees of freedom
## Multiple R-squared: 0.05751, Adjusted R-squared: 0.05737
## F-statistic: 400.6 on 5 and 32827 DF, p-value: < 2.2e-16
# Visualizing the relationship between 'danceability' and 'track_popularity'
plot1 <- ggplot(spotify_songs, aes(x = danceability, y = track_popularity)) +
  geom_point(alpha = 0.4, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Danceability vs Popularity",
    x = "Danceability",
    y = "Popularity"
  )

# Visualizing the interaction effect between 'danceability' and 'energy'.
# `energy` is continuous, so grouping the smoother by the raw value would fit
# a separate regression line for every distinct energy value. Instead, bin
# energy into three equal-count levels and draw one fitted line per level;
# differing slopes across the levels are what an interaction would look like.
plot2 <- ggplot(
  spotify_songs,
  aes(
    x = danceability,
    y = track_popularity,
    color = cut_number(energy, 3, labels = c("Low", "Medium", "High"))
  )
) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Interaction: Danceability and Energy vs Popularity",
    x = "Danceability",
    y = "Popularity",
    color = "Energy"
  )

# Visualizing the effect of 'loudness' on 'track_popularity'
plot3 <- ggplot(spotify_songs, aes(x = loudness, y = track_popularity)) +
  geom_point(alpha = 0.4, color = "green") +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(title = "Loudness vs Popularity", x = "Loudness", y = "Popularity")

# Visualizing the effect of 'duration_ms' (integer variable) on
# 'track_popularity'
plot4 <- ggplot(spotify_songs, aes(x = duration_ms, y = track_popularity)) +
  geom_point(alpha = 0.4, color = "purple") +
  geom_smooth(method = "lm", se = FALSE, color = "pink") +
  labs(
    title = "Duration (ms) vs Popularity",
    x = "Duration (ms)",
    y = "Popularity"
  )

# Checking model diagnostics using the default lm diagnostic plots.
# par() returns the previous settings, so keep them and restore the layout
# afterwards rather than leaving the device in 2x2 mode.
old_par <- par(mfrow = c(2, 2))
plot(model_extended)
par(old_par)

# Show the four ggplot panels together in a 2-column grid
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Why include it: Interaction terms are useful to examine whether the effect of one predictor depends on the level of another predictor. In this case, we are testing whether the relationship between danceability and popularity changes depending on the level of energy. For example, songs with high energy might be more popular when their danceability is also high, suggesting a synergistic effect.
Why include it: Loudness is a continuous variable that may directly affect how popular a song is, as louder songs could attract more attention or fit certain genres that tend to be more popular. We include loudness to test whether it has a significant impact on popularity.
Why include it: The duration of a song could have a significant impact on its popularity, as longer or shorter songs might appeal to different audiences. Including duration_ms allows us to see whether the length of a song is associated with its popularity. This integer variable is particularly relevant for the music industry, where song duration may influence listening patterns or repeat plays.
# Fit the extended specification (danceability x energy interaction plus
# loudness and duration_ms) for the assumption checks below.
lm_model <- lm(
  formula = track_popularity ~ danceability * energy + loudness + duration_ms,
  data = spotify_songs
)

# Diagnostic plots, arranged in a 2x2 grid. Panel numbers follow plot.lm():
#   1 = Residuals vs Fitted  (linearity and homoscedasticity)
#   2 = Normal Q-Q           (normality of residuals)
#   3 = Scale-Location       (homoscedasticity)
#   5 = Residuals vs Leverage (influential points)
par(mfrow = c(2, 2))
for (panel in c(1, 2, 3, 5)) {
  plot(lm_model, which = panel)
}

# Checking multicollinearity via variance inflation factors.
# car reports that the model has higher-order (interaction) terms and
# suggests considering type = 'predictor'; see ?vif.
vif(lm_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## danceability energy loudness duration_ms
## 13.025357 17.831721 1.918335 1.036397
## danceability:energy
## 26.321480
Purpose: This plot helps assess linearity (whether the relationship between predictors and the response is linear) and homoscedasticity (whether the variance of residuals is constant across all levels of fitted values).
What to look for: - The residuals should be randomly scattered around the horizontal axis (fitted values). - No clear pattern (like a curve or funnel shape) should be present.
Interpretation:
No issues: If the residuals are evenly scattered around 0 with no distinct patterns, the linearity assumption holds, and the model’s residuals have constant variance.
Issues: If the plot shows a non-linear pattern (e.g., a curve), it indicates that the model might not be capturing a potential non-linear relationship. A funnel-shaped pattern (increasing or decreasing spread of residuals) would suggest heteroscedasticity (non-constant variance of residuals). Severity and Confidence:
If there are mild deviations from randomness but no strong patterns, the model might still be acceptable with a high level of confidence.
Severe deviations like a strong pattern or funnel shape would suggest the need for transformations or adding non-linear terms. Confidence in meeting assumptions is low in this case.
Purpose: This plot tests the normality assumption of the residuals, which is crucial for hypothesis testing and confidence intervals.
What to look for: - The residuals should lie along the diagonal line.
Interpretation: - No issues: If the residuals follow the diagonal line closely, this suggests that they are normally distributed. - Issues: If there are significant deviations from the line (especially at the tails), it suggests that the residuals are not normally distributed, indicating potential issues such as skewness or heavy tails.
Severity and Confidence: - If the points deviate slightly from the line, particularly in the middle but not at the ends, this might be acceptable, and the normality assumption could still hold. - Severe deviations (especially at the tails) indicate that the normality assumption is violated. Transformations (e.g., log or square root) might be needed to address this issue. Confidence is lower if there is a large departure from normality.
Purpose: This plot also checks for homoscedasticity, meaning the residuals should have a constant spread across fitted values.
What to look for: - The residuals should be randomly scattered around a horizontal line, with roughly equal variance across all fitted values.
Interpretation: - No issues: If the residuals are evenly spread around the horizontal line, the homoscedasticity assumption is met. - Issues: A funnel-shaped pattern (where the spread of residuals increases or decreases as fitted values increase) suggests heteroscedasticity (non-constant variance).
Severity and Confidence: - Slight deviations: If the residuals show a mild increase or decrease in spread, this could be acceptable, with moderate confidence in the homoscedasticity assumption. - Severe funnel shape: Indicates heteroscedasticity, and the model might need adjustments (such as weighted least squares or transforming the response variable). Low confidence if this pattern is observed.
Purpose: This plot identifies influential points that have a large effect on the regression model, potentially distorting the results.
What to look for: - Points that lie outside the Cook’s distance lines (dotted lines) are considered influential.
Interpretation: - No issues: If no points fall outside the Cook’s distance lines, there are no highly influential points. - Issues: If one or more points lie outside the Cook’s distance lines, these points might disproportionately affect the model. They could be outliers or influential observations, requiring further investigation.
Severity and Confidence: - Mild influence: If only one or two points are close to the Cook’s distance lines, the impact might be minor, with high confidence in the model. - Severe influence: If multiple points lie outside Cook’s distance lines, the model is likely skewed by influential observations, requiring removal or separate analysis of these points. Confidence is lower in such cases.
Purpose: This plot is optional but useful for checking autocorrelation in residuals, especially if the data is time-ordered.
What to look for: - The residuals should be randomly scattered across time or the order of data points.
Interpretation: - No issues: If the residuals show no pattern and are randomly scattered, this indicates that autocorrelation is not a problem, and the residuals are independent. - Issues: If there is a pattern (such as a cyclical or wave-like structure), this suggests autocorrelation, meaning the residuals are not independent, and the model may need to account for time-related structures (e.g., using a time series model).
Severity and Confidence: - If the residuals show no pattern, you can be confident that the independence assumption is met. - If a pattern is observed, it suggests a violation of independence, and confidence in the model’s assumptions decreases.
Initially, a simple linear regression model was built with danceability as a predictor for popularity. This provided a baseline understanding of how much danceability alone explains the variation in popularity.
A simple model is often insightful as it highlights the effect of a single key predictor without the interference of other variables. However, single predictors are rarely enough to explain complex outcomes, so adding other relevant variables was the logical next step.
What other factors beyond danceability contribute to popularity? Is there an interaction or combination of factors that could provide a more nuanced understanding?
Interaction Term (danceability * energy): Added to check if the effect of danceability on popularity depends on the level of energy.
Continuous Variable (loudness): Included to evaluate the direct effect of volume on popularity, as louder songs might appeal to certain listeners.
Integer Variable (duration_ms): Included to see if song length affects popularity, as shorter or longer songs might attract different audiences. Multicollinearity Check: Variance Inflation Factor (VIF) was used to assess multicollinearity, with values below 5 indicating acceptable levels of correlation between predictors.
Significance: Adding these terms broadens the scope of the model, allowing it to capture more complex patterns and thus potentially improve predictive accuracy.
Further Questions: Does the addition of each variable improve the model meaningfully? Are there diminishing returns in predictive power as we add more terms?
Insight: This plot tests linearity and homoscedasticity. Random scatter around the horizontal axis suggests that the model captures a linear relationship, while constant variance implies homoscedasticity.
Interpretation: No Issues: If residuals appear evenly scattered, the assumptions are likely met. Issues: A pattern (e.g., a curve) suggests non-linearity, while a funnel shape suggests heteroscedasticity.
Severity and Confidence: Minor Patterns: Mild deviations indicate moderate confidence in the linearity and homoscedasticity assumptions. Clear Patterns: Strong trends lower confidence, suggesting the need for transformations or additional non-linear terms.
Insight: The Q-Q plot checks if residuals follow a normal distribution. Points that fall along the line indicate normality. Interpretation:
No Issues: Points closely following the diagonal line imply normality. Issues: Large deviations, especially at the tails, suggest skewness or heavy tails, indicating non-normality.
Severity and Confidence: Mild Deviations: Slight deviations from normality may be acceptable, with moderate confidence. Large Deviations: Strong deviations suggest the need for transformations, as they indicate violations of normality.
Insight: This plot also evaluates homoscedasticity, expecting residuals to show a constant spread around a horizontal line.
Interpretation: No Issues: If the residuals show an even spread, the assumption is met. Issues: A funnel shape indicates heteroscedasticity, implying that residual variance changes with fitted values.
Severity and Confidence: Minor Spread Changes: Slight deviations are tolerable, with high confidence. Significant Spread Change: A funnel shape implies heteroscedasticity, lowering confidence in meeting the assumption.
Insight: This plot identifies influential points that may disproportionately affect the model’s fit. Points beyond Cook’s distance lines are considered influential.
Interpretation: No Issues: If points lie within Cook’s distance lines, influential observations are minimal. Issues: Points outside the lines indicate high leverage and influence on the model, possibly skewing results.
Severity and Confidence: Few Influential Points: Confidence remains high with only minor influence. Multiple Influential Points: Lowers confidence, suggesting the need to reconsider or transform data.
Insight: Useful for time-ordered data, this plot tests independence of residuals by assessing autocorrelation.
Interpretation: No Issues: Random scatter suggests independent residuals. Issues: Patterns indicate autocorrelation, meaning residuals are dependent over time.
Severity and Confidence: Random Scatter: High confidence in independence. Patterned Residuals: Low confidence, prompting time-series methods or lag variables.
Linearity and Complexity: Including interaction and multiple continuous variables captures a more nuanced view, improving model fit. However, adding too many terms may result in diminishing returns and potential overfitting.
Question: Is the model significantly improved by adding each term, or should we prioritize simpler models?
Homoscedasticity: If issues arise in homoscedasticity, it may be essential to consider transformations.
Question: Does a transformation improve model stability without sacrificing interpretability?
Normality of Residuals: Normality is vital for inference, but slight deviations may not always affect the model significantly.
Question: Are deviations from normality severe enough to warrant transformation, or are they within acceptable limits?
Influential Points: Outliers or influential points can skew the model, and understanding their origin is crucial.
Question: Should influential points be retained, or do they reflect anomalies that misrepresent general trends?
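Emphasize that the high VIF values, particularly for danceability and energy (13.03 and 17.83, respectively), indicate substantial multicollinearity. The interaction term between danceability and energy exhibits an even higher VIF (26.32), further underscoring the correlation issue. This correlation is likely due to the related nature of these variables in musical contexts (e.g., high-energy songs often also rate high on danceability). Part of the inflation is also structural: the danceability:energy product term is built from the two main effects, so it is necessarily correlated with them, and mean-centering the predictors before forming the interaction typically reduces these VIFs.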
Suggest approaches to mitigate this, such as removing one of the correlated variables or combining them into a single feature that captures their joint effect.
Residuals vs. Fitted Plot: Specify whether the residuals show any patterns, such as a funnel shape or curve. If such patterns are present, they suggest non-linearity or heteroscedasticity.
Q-Q Plot: Note any significant deviations from the line, especially at the tails, which would indicate issues with normality.
Scale-Location Plot: Address whether residuals are spread evenly around the line. A visible trend suggests heteroscedasticity, indicating that variance may not be constant across fitted values.
4. Residuals vs. Leverage Plot: Identify if there are any points outside the Cook’s distance lines, as they may exert undue influence on the model. If multiple points are present, consider removing them or applying transformations.
2. Discuss how this correlation can skew the regression results and the interpretation of coefficients.