library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## 
## Attaching package: 'tsibble'

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(forecast)

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Load the dataset
spotify_songs <- read.csv("C:/Users/priya/Downloads/spotify_songs.csv")

# Inspect the structure of the dataset to find the column encoding time
str(spotify_songs)

## 'data.frame':    32833 obs. of  23 variables:
##  $ track_id                : chr  "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
##  $ track_name              : chr  "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
##  $ track_artist            : chr  "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
##  $ track_popularity        : int  66 67 70 60 69 67 62 69 68 67 ...
##  $ track_album_id          : chr  "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
##  $ track_album_name        : chr  "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
##  $ track_album_release_date: chr  "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
##  $ playlist_name           : chr  "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
##  $ playlist_id             : chr  "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
##  $ playlist_genre          : chr  "pop" "pop" "pop" "pop" ...
##  $ playlist_subgenre       : chr  "dance pop" "dance pop" "dance pop" "dance pop" ...
##  $ danceability            : num  0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
##  $ energy                  : num  0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
##  $ key                     : int  6 11 1 7 1 8 5 4 8 2 ...
##  $ loudness                : num  -2.63 -4.97 -3.43 -3.78 -4.67 ...
##  $ mode                    : int  1 1 0 1 1 1 0 0 1 1 ...
##  $ speechiness             : num  0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
##  $ acousticness            : num  0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
##  $ instrumentalness        : num  0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
##  $ liveness                : num  0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
##  $ valence                 : num  0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
##  $ tempo                   : num  122 100 124 122 124 ...
##  $ duration_ms             : int  194754 162600 176616 169093 189052 163049 187675 207619 193187 253040 ...

# Extract the year from the "track_album_release_date" column
spotify_songs <- spotify_songs %>%
  mutate(year = substr(track_album_release_date, 1, 4))  # Extract the first 4 characters as the year

# Convert the year column to numeric
spotify_songs <- spotify_songs %>%
  mutate(year = as.numeric(year))

# Add an incremental day starting from 1 to the number of rows
spotify_songs <- spotify_songs %>%
  mutate(day = row_number())

# Create a new Date column using the year, January, and the incremental day
spotify_songs <- spotify_songs %>%
  mutate(date = as.Date(paste(year, "01", day, sep = "-"), format = "%Y-%m-%d"))

# View the first few rows of the updated dataset
head(spotify_songs[, c("track_album_release_date", "year", "day", "date")])

##   track_album_release_date year day       date
## 1               2019-06-14 2019   1 2019-01-01
## 2               2019-12-13 2019   2 2019-01-02
## 3               2019-07-05 2019   3 2019-01-03
## 4               2019-07-19 2019   4 2019-01-04
## 5               2019-03-05 2019   5 2019-01-05
## 6               2019-07-11 2019   6 2019-01-06

# Analyze track popularity over time
# Calculate the mean popularity for each date
popularity_trend <- spotify_songs %>%
  group_by(date) %>%
  summarize(mean_popularity = mean(track_popularity, na.rm = TRUE))  # Handle NA values if any

# View the trend data
head(popularity_trend)

## # A tibble: 6 × 2
##   date       mean_popularity
##   <date>               <dbl>
## 1 1957-01-13               1
## 2 1957-01-22              59
## 3 1958-01-13              73
## 4 1960-01-22              19
## 5 1960-01-25              11
## 6 1960-01-26              15

# Plot the trend over time
ggplot(popularity_trend, aes(x = date, y = mean_popularity)) +
  geom_line() +
  labs(
    title = "Track Popularity Trend Over Time",
    x = "Date",
    y = "Mean Popularity"
  ) +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Remove rows with missing date values and create a tsibble
popularity_tsibble <- spotify_songs %>%
  filter(!is.na(date)) %>%  # Remove rows where `date` is NA
  select(date, track_popularity) %>%
  group_by(date) %>%
  summarize(mean_popularity = mean(track_popularity, na.rm = TRUE)) %>%
  as_tsibble(index = date)

# View the tsibble structure
print(popularity_tsibble)

## # A tsibble: 888 x 2 [1D]
##    date       mean_popularity
##    <date>               <dbl>
##  1 1957-01-13               1
##  2 1957-01-22              59
##  3 1958-01-13              73
##  4 1960-01-22              19
##  5 1960-01-25              11
##  6 1960-01-26              15
##  7 1961-01-22              47
##  8 1962-01-12              64
##  9 1962-01-22              64
## 10 1963-01-12              65
## # ℹ 878 more rows

# Plot the data over time
ggplot(popularity_tsibble, aes(x = date, y = mean_popularity)) +
  geom_line(color = "blue") +
  labs(
    title = "Track Popularity Over Time",
    x = "Date",
    y = "Mean Popularity"
  ) +
  theme_minimal()

stand out immediately:

When analyzing a plot of mean track popularity over time

Trends and Patterns: Increasing or Decreasing Popularity: If there’s an upward or downward trend, it indicates changes in overall music preferences or seasonal shifts in popularity. Plateaus: Periods where popularity remains consistent may suggest times of less variation in listener preferences.
Seasonal Effects: Spikes or Dips: Large increases or decreases in popularity might correlate with significant events (e.g., holidays, album releases, or social trends). Regular Cycles: Weekly or monthly cycles could indicate recurring listener habits (e.g., higher engagement on weekends).
Outliers: Unusual Popularity Spikes: Sharp jumps may correspond to viral hits or the release of highly anticipated music. Sudden Drops: A significant decline might result from data gaps or reduced activity during specific periods. Example Observations: For a dataset like this, some common findings:

Holiday Impact: Popularity tends to spike during the holiday season (e.g., December) due to festive music or year-end recaps. Mid-Year Declines: Drops in engagement during the summer months when listeners might be less engaged. Outliers: Single-day spikes reflecting viral tracks or events

Observations From the Plot:

Highly Variable Popularity:

The plot reveals significant fluctuations in track popularity over time. Peaks and troughs suggest irregular trends, possibly influenced by factors like changing music trends, notable album releases, or external events. Early Years (1957–1970):

Popularity values appear sparse and inconsistent during the earlier years. Data gaps or fewer tracks from earlier periods could contribute to this trend. Increased Activity Post-1980s:

Starting around the 1980s, the frequency of data points increases, indicating more consistent tracking or availability of data. Popularity trends also stabilize slightly, though they remain highly variable. No Clear Long-Term Trend:

There’s no obvious upward or downward trend across the entire timeline. This suggests that track popularity is influenced more by short-term events than by long-term trends. Recent Peaks (2000–2020):

There are some higher peaks in the 2000s and 2010s, possibly due to the modern era of streaming, where highly popular songs dominate charts for short periods. Immediate Standouts: Short-Term Fluctuations: Popularity changes are drastic within short periods, possibly driven by hits that quickly rise and fall. Irregular Gaps: Some time periods have fewer data points, indicating missing data or uneven song releases. Data Collection Bias: The lack of consistent data before the 1980s might result from incomplete historical tracking of music popularity.

# Perform linear regression on the full dataset
full_trend_model <- lm(mean_popularity ~ date, data = popularity_tsibble)

# Summary of the linear regression model
summary(full_trend_model)

## 
## Call:
## lm(formula = mean_popularity ~ date, data = popularity_tsibble)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.924 -10.297   1.155  11.091  41.089 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.603e+01  1.062e+00  43.361  < 2e-16 ***
## date        -3.516e-04  9.184e-05  -3.828 0.000138 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.84 on 886 degrees of freedom
## Multiple R-squared:  0.01627,    Adjusted R-squared:  0.01516 
## F-statistic: 14.65 on 1 and 886 DF,  p-value: 0.0001385

# Plot the trend line over the original data
ggplot(popularity_tsibble, aes(x = date, y = mean_popularity)) +
  geom_line(color = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Track Popularity Trend Over Time (Linear Regression)",
    x = "Date",
    y = "Mean Popularity"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Subset the data into multiple time periods
# Example: Pre-2000 and Post-2000
pre_2000 <- popularity_tsibble %>%
  filter(date < as.Date("2000-01-01"))
post_2000 <- popularity_tsibble %>%
  filter(date >= as.Date("2000-01-01"))

# Perform regression for Pre-2000
pre_2000_model <- lm(mean_popularity ~ date, data = pre_2000)
summary(pre_2000_model)

## 
## Call:
## lm(formula = mean_popularity ~ date, data = pre_2000)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.778 -10.509   1.787  11.858  38.814 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.7302456  1.3525042  35.290   <2e-16 ***
## date        -0.0006435  0.0002116  -3.041   0.0025 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.14 on 437 degrees of freedom
## Multiple R-squared:  0.02073,    Adjusted R-squared:  0.01849 
## F-statistic: 9.249 on 1 and 437 DF,  p-value: 0.002498

# Perform regression for Post-2000
post_2000_model <- lm(mean_popularity ~ date, data = post_2000)
summary(post_2000_model)

## 
## Call:
## lm(formula = mean_popularity ~ date, data = post_2000)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.082  -9.436  -0.322   9.894  45.285 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.675e+01  4.521e+00   5.917 6.54e-09 ***
## date        9.360e-04  3.019e-04   3.100  0.00205 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.16 on 447 degrees of freedom
## Multiple R-squared:  0.02105,    Adjusted R-squared:  0.01886 
## F-statistic: 9.612 on 1 and 447 DF,  p-value: 0.002055

# Plot both trends
ggplot() +
  geom_line(data = pre_2000, aes(x = date, y = mean_popularity), color = "blue", alpha = 0.5) +
  geom_smooth(data = pre_2000, aes(x = date, y = mean_popularity), method = "lm", se = TRUE, color = "red") +
  geom_line(data = post_2000, aes(x = date, y = mean_popularity), color = "green", alpha = 0.5) +
  geom_smooth(data = post_2000, aes(x = date, y = mean_popularity), method = "lm", se = TRUE, color = "purple") +
  labs(
    title = "Track Popularity Trends (Pre-2000 and Post-2000)",
    x = "Date",
    y = "Mean Popularity"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Do you need to subset the data for multiple trends?

Yes, subsetting the data is necessary to detect multiple trends effectively. Based on the plots and regression results:

Pre-2000 Era: The data is sparse, with irregular trends. Subsetting this era allows for identifying the unique patterns that might not align with the post-2000 era. Post-2000 Era: The data is denser and likely reflects more consistent tracking and recording of popularity. Subsetting this era isolates trends in modern music data, reducing the noise from sparse historical data. By subsetting the data into pre-2000 and post-2000 periods, we account for these differences and analyze the trends more meaningfully.

How strong are these trends?

Overall Trend (Full Dataset): Slope: The coefficient for date is -0.0003516, indicating a slight downward trend in popularity over time. Statistical Significance: The p-value is 0.000138, so the trend is statistically significant at the 0.001 level. Strength of Trend: - R-squared = 0.0163: Only 1.63% of the variability in mean popularity is explained by time. - The trend is weak, with considerable variability in popularity that is not explained by time alone.

Pre-2000 Trend: Slope: The coefficient for date is -0.0006435, indicating a stronger downward trend compared to the overall trend. Statistical Significance: The p-value is 0.0025, so the trend is statistically significant at the 0.01 level. Strength of Trend: - R-squared = 0.0207: Only 2.07% of the variability in mean popularity is explained by time. - The trend is slightly stronger but still weak, with large residual variability.

Post-2000 Trend: Slope: The coefficient for date is 0.0009360, indicating a slight upward trend in popularity over time. Statistical Significance: The p-value is 0.00205, so the trend is statistically significant at the 0.01 level. Strength of Trend: - R-squared = 0.0210: Only 2.10% of the variability in mean popularity is explained by time. - The trend is weak but slightly stronger than the pre-2000 era, with modern music showing some consistency in increasing popularity.

# Apply smoothing using a rolling mean
popularity_smoothed <- popularity_tsibble %>%
  mutate(smoothed_popularity = zoo::rollmean(mean_popularity, k = 12, fill = NA))  # 12-month moving average

# Plot the smoothed data
ggplot(popularity_smoothed, aes(x = date)) +
  geom_line(aes(y = mean_popularity), color = "blue", alpha = 0.5) +
  geom_line(aes(y = smoothed_popularity), color = "red", size = 1) +
  labs(
    title = "Smoothing to Detect Seasonality",
    x = "Date",
    y = "Popularity"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Perform ACF to detect seasonality
acf_results <- acf(popularity_tsibble$mean_popularity, lag.max = 36, main = "ACF of Track Popularity")

# Perform PACF to detect seasonality
pacf_results <- pacf(popularity_tsibble$mean_popularity, lag.max = 36, main = "PACF of Track Popularity")

Explanation of Code:

Smoothing: Use a 12-month moving average (rollmean) to smooth the data and reveal seasonal trends. Adjust k as needed based on the expected periodicity.

ACF: Autocorrelation plots show correlations between observations at different lags. Peaks at regular intervals indicate seasonality.

PACF: The PACF helps isolate the direct effects of seasonal lags and is useful for identifying the order of a seasonal autoregressive model.

ACF (Autocorrelation Function) Plot: Interpretation

It Represents: The ACF plot measures the correlation of the time series with its own lagged values. Each bar in the plot represents the strength of correlation at a particular lag.

Specific Observations from the Plot:

Lag 1: The correlation at lag 1 is significant and positive (e.g., ACF ≈ 0.6). This means that the value of the time series at a given time is strongly influenced by the immediately preceding value.

Decay Across Lags: The ACF plot shows a gradual decline in correlation as the lag increases, with significant positive correlations for lags 2, 3, and beyond. This slow decay indicates the presence of a trend or non-stationarity in the data.

No Sharp Seasonal Peaks: There are no sharp spikes at regular intervals (e.g., lag 12 for annual seasonality), suggesting that seasonality is not dominant or needs further investigation.

The slow decay suggests that your data is non-stationary and contains a trend component. Before applying models like ARIMA, the data should be differenced to remove this trend and achieve stationarity. The lack of seasonal spikes indicates that simpler autoregressive or moving average models may suffice, rather than requiring seasonal extensions.

PACF (Partial Autocorrelation Function) Plot: Interpretation

It Represents: The PACF plot shows the direct correlation between the time series and its lagged values, removing the influence of intermediate lags. Specific Observations from Your Plot:

Lag 1: The partial autocorrelation at lag 1 is significant and positive (e.g., PACF ≈ 0.6), just like the ACF. This indicates that the value of the time series is directly influenced by the value at lag 1.

Cutoff After Lag 1: After lag 1, the PACF plot shows no significant correlations (values drop to near zero or within the confidence bounds). This suggests that the time series can be modeled as an AR(1) process (an autoregressive model of order 1).

No High Partial Correlations at Seasonal Lags: Similar to the ACF, the PACF does not show significant partial correlations at seasonal lags (e.g., lag 12), further supporting the absence of strong seasonal components.

The sharp cutoff after lag 1 suggests that an AR(1) model is a good starting point. However, because the ACF shows a slow decay, you should difference the data before fitting the model to account for non-stationarity.

Insights: The slow ACF decay and sharp PACF cutoff suggest that the data has a trend and is influenced by immediate lagged values (AR(1)). Seasonal effects are not prominent, as no significant spikes are visible at seasonal lags (e.g., lag 12).

Significance: These plots guide the choice of model. Specifically: Differencing is required to remove the trend and achieve stationarity. An AR(1) process is sufficient to capture the data’s structure after differencing.

Converting Time Data to Date Format

Extracted the ‘year’ from the ‘track_album_release_date’ column and created a new ‘date’ column by combining the year with a default month and day.

Insight

onverted the time-encoding column (track_album_release_date) into a proper Date object by extracting the year and appending default month and day values. This standardized the time representation in your dataset.The ‘date’ column now represents January 1st of each extracted year.

Significance:

Converting the time data into a proper date format allows you to leverage time-based analyses, such as detecting trends and seasonality. This step ensures compatibility with time-series libraries like tsibble and enables temporal aggregations (e.g., yearly or monthly averages).

Further Investigation:

Consider incorporating actual release months and days if available, as this could provide more granular insights into seasonal trends.

Selecting a Response Variable for Time Analysis

selected ‘track_popularity’ as the response variable. ‘track_popularity’ reflects the popularity score of each track over time.

Significance:

By choosing track_popularity, focuses on a variable that represents audience engagement and industry success. Trends in this variable could indicate shifts in the music industry, such as the rise of streaming services or viral hits.

Further Investigation:

Explore other potential response variables, such as ‘danceability’ or ‘energy’, to see if they exhibit different temporal patterns.
Could genre or artist-level analysis provide more granular insights into the factors driving popularity?

Creating a Tsibble and Plotting Data Over Time

created a tsibble with ‘date’ and ‘mean_popularity’, then plotted the data over the entire time span.

The plot reveals fluctuations in mean_popularity. Pre-2000 data is sparse and irregular, while post-2000 data shows higher density and more consistent patterns.

Significance:

Visualizing the data helps identify trends, seasonal patterns, and anomalies. This visualization highlights potential changes in data collection practices or music consumption trends over time. For instance, the rise of streaming platforms might explain the increased density of data after 2000.

Further Investigation:

Zoom into specific periods (e.g., 2000-2010) to detect finer trends or anomalies.
Are there any specific years or periods with sudden spikes or drops? What caused them?

Detecting Trends Using Linear Regression

performed linear regression on the full dataset and on subsets (pre-2000 and post-2000).

Full Dataset: A weak but statistically significant downward trend exists overall (slope = -0.0003516). Pre-2000: The trend is slightly stronger and downward (slope = -0.0006435), likely due to sparse and inconsistent data. Post-2000: A slight upward trend (slope = 0.0009360) reflects increasing popularity scores, possibly due to better tracking and higher engagement in the streaming era. #Significance: Trends reveal the changing dynamics of music popularity over time. The downward trend pre-2000 might reflect biases or limitations in older data, while the upward trend post-2000 aligns with the growth of streaming platforms and global music reach. # Further Investigation: 1. Investigate potential causes for these shifts, such as changes in music consumption habits, technological advancements, or industry practices. 2. How do trends differ across genres or artists during the same periods?

Detecting Seasonality Using Smoothing and ACF/PACF

applied a 12-month moving average to smooth the data and used ACF and PACF plots to detect seasonality.

The smoothed plot reveals periodic fluctuations in mean_popularity, and the ACF plot shows peaks at lags corresponding to 12 months, indicating annual seasonality.

Significance:

The presence of annual seasonality suggests that music popularity is influenced by recurring events, such as holiday seasons, summer festivals, or year-end album releases. # Further Investigation: 1. Are specific months (e.g., December) consistently driving seasonal peaks? 2. How do seasonal patterns vary by genre, region, or artist?

Data Dive — Time-based Data

2024-11-19

stand out immediately:

Do you need to subset the data for multiple trends?

How strong are these trends?

Explanation of Code:

ACF (Autocorrelation Function) Plot: Interpretation

PACF (Partial Autocorrelation Function) Plot: Interpretation

Converting Time Data to Date Format

Insight

Significance:

Further Investigation:

Selecting a Response Variable for Time Analysis

Significance:

Further Investigation:

Creating a Tsibble and Plotting Data Over Time

Significance:

Further Investigation:

Detecting Trends Using Linear Regression

Detecting Seasonality Using Smoothing and ACF/PACF

Significance: