library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the dataset
spotify_songs <- read.csv("C:/Users/priya/Downloads/spotify_songs.csv")
# Inspect the structure of the dataset to find the column encoding time
str(spotify_songs)
## 'data.frame': 32833 obs. of 23 variables:
## $ track_id : chr "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
## $ track_name : chr "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
## $ track_artist : chr "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
## $ track_popularity : int 66 67 70 60 69 67 62 69 68 67 ...
## $ track_album_id : chr "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
## $ track_album_name : chr "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
## $ track_album_release_date: chr "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
## $ playlist_name : chr "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
## $ playlist_id : chr "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
## $ playlist_genre : chr "pop" "pop" "pop" "pop" ...
## $ playlist_subgenre : chr "dance pop" "dance pop" "dance pop" "dance pop" ...
## $ danceability : num 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
## $ energy : num 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
## $ key : int 6 11 1 7 1 8 5 4 8 2 ...
## $ loudness : num -2.63 -4.97 -3.43 -3.78 -4.67 ...
## $ mode : int 1 1 0 1 1 1 0 0 1 1 ...
## $ speechiness : num 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
## $ acousticness : num 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
## $ instrumentalness : num 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
## $ liveness : num 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
## $ valence : num 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
## $ tempo : num 122 100 124 122 124 ...
## $ duration_ms : int 194754 162600 176616 169093 189052 163049 187675 207619 193187 253040 ...
# Extract the year from the "track_album_release_date" column
spotify_songs <- spotify_songs %>%
mutate(year = substr(track_album_release_date, 1, 4)) # Extract the first 4 characters as the year
# Convert the year column to numeric
spotify_songs <- spotify_songs %>%
mutate(year = as.numeric(year))
# Add an incremental day starting from 1 to the number of rows
spotify_songs <- spotify_songs %>%
mutate(day = row_number())
# Create a new Date column using the year, January, and the incremental day
spotify_songs <- spotify_songs %>%
mutate(date = as.Date(paste(year, "01", day, sep = "-"), format = "%Y-%m-%d"))
# View the first few rows of the updated dataset
head(spotify_songs[, c("track_album_release_date", "year", "day", "date")])
## track_album_release_date year day date
## 1 2019-06-14 2019 1 2019-01-01
## 2 2019-12-13 2019 2 2019-01-02
## 3 2019-07-05 2019 3 2019-01-03
## 4 2019-07-19 2019 4 2019-01-04
## 5 2019-03-05 2019 5 2019-01-05
## 6 2019-07-11 2019 6 2019-01-06
# Analyze track popularity over time
# Calculate the mean popularity for each date
popularity_trend <- spotify_songs %>%
group_by(date) %>%
summarize(mean_popularity = mean(track_popularity, na.rm = TRUE)) # Handle NA values if any
# View the trend data
head(popularity_trend)
## # A tibble: 6 × 2
## date mean_popularity
## <date> <dbl>
## 1 1957-01-13 1
## 2 1957-01-22 59
## 3 1958-01-13 73
## 4 1960-01-22 19
## 5 1960-01-25 11
## 6 1960-01-26 15
# Plot the trend over time
ggplot(popularity_trend, aes(x = date, y = mean_popularity)) +
geom_line() +
labs(
title = "Track Popularity Trend Over Time",
x = "Date",
y = "Mean Popularity"
) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# Remove rows with missing date values and create a tsibble
popularity_tsibble <- spotify_songs %>%
filter(!is.na(date)) %>% # Remove rows where `date` is NA
select(date, track_popularity) %>%
group_by(date) %>%
summarize(mean_popularity = mean(track_popularity, na.rm = TRUE)) %>%
as_tsibble(index = date)
# View the tsibble structure
print(popularity_tsibble)
## # A tsibble: 888 x 2 [1D]
## date mean_popularity
## <date> <dbl>
## 1 1957-01-13 1
## 2 1957-01-22 59
## 3 1958-01-13 73
## 4 1960-01-22 19
## 5 1960-01-25 11
## 6 1960-01-26 15
## 7 1961-01-22 47
## 8 1962-01-12 64
## 9 1962-01-22 64
## 10 1963-01-12 65
## # ℹ 878 more rows
# Plot the data over time
ggplot(popularity_tsibble, aes(x = date, y = mean_popularity)) +
geom_line(color = "blue") +
labs(
title = "Track Popularity Over Time",
x = "Date",
y = "Mean Popularity"
) +
theme_minimal()
When analyzing a plot of mean track popularity over time
Holiday Impact: Popularity tends to spike during the holiday season (e.g., December) due to festive music or year-end recaps. Mid-Year Declines: Drops in engagement during the summer months when listeners might be less engaged. Outliers: Single-day spikes reflecting viral tracks or events
Observations From the Plot:
Highly Variable Popularity:
The plot reveals significant fluctuations in track popularity over time. Peaks and troughs suggest irregular trends, possibly influenced by factors like changing music trends, notable album releases, or external events. Early Years (1957–1970):
Popularity values appear sparse and inconsistent during the earlier years. Data gaps or fewer tracks from earlier periods could contribute to this trend. Increased Activity Post-1980s:
Starting around the 1980s, the frequency of data points increases, indicating more consistent tracking or availability of data. Popularity trends also stabilize slightly, though they remain highly variable. No Clear Long-Term Trend:
There’s no obvious upward or downward trend across the entire timeline. This suggests that track popularity is influenced more by short-term events than by long-term trends. Recent Peaks (2000–2020):
There are some higher peaks in the 2000s and 2010s, possibly due to the modern era of streaming, where highly popular songs dominate charts for short periods. Immediate Standouts: Short-Term Fluctuations: Popularity changes are drastic within short periods, possibly driven by hits that quickly rise and fall. Irregular Gaps: Some time periods have fewer data points, indicating missing data or uneven song releases. Data Collection Bias: The lack of consistent data before the 1980s might result from incomplete historical tracking of music popularity.
# Perform linear regression on the full dataset
full_trend_model <- lm(mean_popularity ~ date, data = popularity_tsibble)
# Summary of the linear regression model
summary(full_trend_model)
##
## Call:
## lm(formula = mean_popularity ~ date, data = popularity_tsibble)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.924 -10.297 1.155 11.091 41.089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.603e+01 1.062e+00 43.361 < 2e-16 ***
## date -3.516e-04 9.184e-05 -3.828 0.000138 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.84 on 886 degrees of freedom
## Multiple R-squared: 0.01627, Adjusted R-squared: 0.01516
## F-statistic: 14.65 on 1 and 886 DF, p-value: 0.0001385
# Plot the trend line over the original data
ggplot(popularity_tsibble, aes(x = date, y = mean_popularity)) +
geom_line(color = "blue", alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(
title = "Track Popularity Trend Over Time (Linear Regression)",
x = "Date",
y = "Mean Popularity"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Subset the data into multiple time periods
# Example: Pre-2000 and Post-2000
pre_2000 <- popularity_tsibble %>%
filter(date < as.Date("2000-01-01"))
post_2000 <- popularity_tsibble %>%
filter(date >= as.Date("2000-01-01"))
# Perform regression for Pre-2000
pre_2000_model <- lm(mean_popularity ~ date, data = pre_2000)
summary(pre_2000_model)
##
## Call:
## lm(formula = mean_popularity ~ date, data = pre_2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.778 -10.509 1.787 11.858 38.814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.7302456 1.3525042 35.290 <2e-16 ***
## date -0.0006435 0.0002116 -3.041 0.0025 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.14 on 437 degrees of freedom
## Multiple R-squared: 0.02073, Adjusted R-squared: 0.01849
## F-statistic: 9.249 on 1 and 437 DF, p-value: 0.002498
# Perform regression for Post-2000
post_2000_model <- lm(mean_popularity ~ date, data = post_2000)
summary(post_2000_model)
##
## Call:
## lm(formula = mean_popularity ~ date, data = post_2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.082 -9.436 -0.322 9.894 45.285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.675e+01 4.521e+00 5.917 6.54e-09 ***
## date 9.360e-04 3.019e-04 3.100 0.00205 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.16 on 447 degrees of freedom
## Multiple R-squared: 0.02105, Adjusted R-squared: 0.01886
## F-statistic: 9.612 on 1 and 447 DF, p-value: 0.002055
# Plot both trends
ggplot() +
geom_line(data = pre_2000, aes(x = date, y = mean_popularity), color = "blue", alpha = 0.5) +
geom_smooth(data = pre_2000, aes(x = date, y = mean_popularity), method = "lm", se = TRUE, color = "red") +
geom_line(data = post_2000, aes(x = date, y = mean_popularity), color = "green", alpha = 0.5) +
geom_smooth(data = post_2000, aes(x = date, y = mean_popularity), method = "lm", se = TRUE, color = "purple") +
labs(
title = "Track Popularity Trends (Pre-2000 and Post-2000)",
x = "Date",
y = "Mean Popularity"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Yes, subsetting the data is necessary to detect multiple trends effectively. Based on the plots and regression results:
Pre-2000 Era: The data is sparse, with irregular trends. Subsetting this era allows for identifying the unique patterns that might not align with the post-2000 era. Post-2000 Era: The data is denser and likely reflects more consistent tracking and recording of popularity. Subsetting this era isolates trends in modern music data, reducing the noise from sparse historical data. By subsetting the data into pre-2000 and post-2000 periods, we account for these differences and analyze the trends more meaningfully.
Overall Trend (Full Dataset): Slope: The coefficient for date is -0.0003516, indicating a slight downward trend in popularity over time. Statistical Significance: The p-value is 0.000138, so the trend is statistically significant at the 0.001 level. Strength of Trend: - R-squared = 0.0163: Only 1.63% of the variability in mean popularity is explained by time. - The trend is weak, with considerable variability in popularity that is not explained by time alone.
Pre-2000 Trend: Slope: The coefficient for date is -0.0006435, indicating a stronger downward trend compared to the overall trend. Statistical Significance: The p-value is 0.0025, so the trend is statistically significant at the 0.01 level. Strength of Trend: - R-squared = 0.0207: Only 2.07% of the variability in mean popularity is explained by time. - The trend is slightly stronger but still weak, with large residual variability.
Post-2000 Trend: Slope: The coefficient for date is 0.0009360, indicating a slight upward trend in popularity over time. Statistical Significance: The p-value is 0.00205, so the trend is statistically significant at the 0.01 level. Strength of Trend: - R-squared = 0.0210: Only 2.10% of the variability in mean popularity is explained by time. - The trend is weak but slightly stronger than the pre-2000 era, with modern music showing some consistency in increasing popularity.
# Apply smoothing using a rolling mean
popularity_smoothed <- popularity_tsibble %>%
mutate(smoothed_popularity = zoo::rollmean(mean_popularity, k = 12, fill = NA)) # 12-month moving average
# Plot the smoothed data
ggplot(popularity_smoothed, aes(x = date)) +
geom_line(aes(y = mean_popularity), color = "blue", alpha = 0.5) +
geom_line(aes(y = smoothed_popularity), color = "red", size = 1) +
labs(
title = "Smoothing to Detect Seasonality",
x = "Date",
y = "Popularity"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Perform ACF to detect seasonality
acf_results <- acf(popularity_tsibble$mean_popularity, lag.max = 36, main = "ACF of Track Popularity")
# Perform PACF to detect seasonality
pacf_results <- pacf(popularity_tsibble$mean_popularity, lag.max = 36, main = "PACF of Track Popularity")
Smoothing: Use a 12-month moving average (rollmean) to smooth the data and reveal seasonal trends. Adjust k as needed based on the expected periodicity.
ACF: Autocorrelation plots show correlations between observations at different lags. Peaks at regular intervals indicate seasonality.
PACF: The PACF helps isolate the direct effects of seasonal lags and is useful for identifying the order of a seasonal autoregressive model.
It Represents: The ACF plot measures the correlation of the time series with its own lagged values. Each bar in the plot represents the strength of correlation at a particular lag.
Specific Observations from the Plot:
Lag 1: The correlation at lag 1 is significant and positive (e.g., ACF ≈ 0.6). This means that the value of the time series at a given time is strongly influenced by the immediately preceding value.
Decay Across Lags: The ACF plot shows a gradual decline in correlation as the lag increases, with significant positive correlations for lags 2, 3, and beyond. This slow decay indicates the presence of a trend or non-stationarity in the data.
No Sharp Seasonal Peaks: There are no sharp spikes at regular intervals (e.g., lag 12 for annual seasonality), suggesting that seasonality is not dominant or needs further investigation.
The slow decay suggests that your data is non-stationary and contains a trend component. Before applying models like ARIMA, the data should be differenced to remove this trend and achieve stationarity. The lack of seasonal spikes indicates that simpler autoregressive or moving average models may suffice, rather than requiring seasonal extensions.
It Represents: The PACF plot shows the direct correlation between the time series and its lagged values, removing the influence of intermediate lags. Specific Observations from Your Plot:
Lag 1: The partial autocorrelation at lag 1 is significant and positive (e.g., PACF ≈ 0.6), just like the ACF. This indicates that the value of the time series is directly influenced by the value at lag 1.
Cutoff After Lag 1: After lag 1, the PACF plot shows no significant correlations (values drop to near zero or within the confidence bounds). This suggests that the time series can be modeled as an AR(1) process (an autoregressive model of order 1).
No High Partial Correlations at Seasonal Lags: Similar to the ACF, the PACF does not show significant partial correlations at seasonal lags (e.g., lag 12), further supporting the absence of strong seasonal components.
The sharp cutoff after lag 1 suggests that an AR(1) model is a good starting point. However, because the ACF shows a slow decay, you should difference the data before fitting the model to account for non-stationarity.
Insights: The slow ACF decay and sharp PACF cutoff suggest that the data has a trend and is influenced by immediate lagged values (AR(1)). Seasonal effects are not prominent, as no significant spikes are visible at seasonal lags (e.g., lag 12).
Significance: These plots guide the choice of model. Specifically: Differencing is required to remove the trend and achieve stationarity. An AR(1) process is sufficient to capture the data’s structure after differencing.
Extracted the ‘year’ from the ‘track_album_release_date’ column and created a new ‘date’ column by combining the year with a default month and day.
onverted the time-encoding column (track_album_release_date) into a proper Date object by extracting the year and appending default month and day values. This standardized the time representation in your dataset.The ‘date’ column now represents January 1st of each extracted year.
Converting the time data into a proper date format allows you to leverage time-based analyses, such as detecting trends and seasonality. This step ensures compatibility with time-series libraries like tsibble and enables temporal aggregations (e.g., yearly or monthly averages).
Consider incorporating actual release months and days if available, as this could provide more granular insights into seasonal trends.
selected ‘track_popularity’ as the response variable. ‘track_popularity’ reflects the popularity score of each track over time.
By choosing track_popularity, focuses on a variable that represents audience engagement and industry success. Trends in this variable could indicate shifts in the music industry, such as the rise of streaming services or viral hits.
created a tsibble with ‘date’ and ‘mean_popularity’, then plotted the data over the entire time span.
The plot reveals fluctuations in mean_popularity. Pre-2000 data is sparse and irregular, while post-2000 data shows higher density and more consistent patterns.
Visualizing the data helps identify trends, seasonal patterns, and anomalies. This visualization highlights potential changes in data collection practices or music consumption trends over time. For instance, the rise of streaming platforms might explain the increased density of data after 2000.
performed linear regression on the full dataset and on subsets (pre-2000 and post-2000).
Full Dataset: A weak but statistically significant downward trend exists overall (slope = -0.0003516). Pre-2000: The trend is slightly stronger and downward (slope = -0.0006435), likely due to sparse and inconsistent data. Post-2000: A slight upward trend (slope = 0.0009360) reflects increasing popularity scores, possibly due to better tracking and higher engagement in the streaming era. #Significance: Trends reveal the changing dynamics of music popularity over time. The downward trend pre-2000 might reflect biases or limitations in older data, while the upward trend post-2000 aligns with the growth of streaming platforms and global music reach. # Further Investigation: 1. Investigate potential causes for these shifts, such as changes in music consumption habits, technological advancements, or industry practices. 2. How do trends differ across genres or artists during the same periods?
applied a 12-month moving average to smooth the data and used ACF and PACF plots to detect seasonality.
The smoothed plot reveals periodic fluctuations in mean_popularity, and the ACF plot shows peaks at lags corresponding to 12 months, indicating annual seasonality.
The presence of annual seasonality suggests that music popularity is influenced by recurring events, such as holiday seasons, summer festivals, or year-end album releases. # Further Investigation: 1. Are specific months (e.g., December) consistently driving seasonal peaks? 2. How do seasonal patterns vary by genre, region, or artist?