library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)

# time series toolkits
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Read the raw Spotify export from a local, machine-specific path.
data <- read.csv("/Users/yashuvaishu/Downloads/Spotify...csv")

# Seed the RNG (nothing in this chunk itself draws random numbers).
set.seed(42)

# Attach a synthetic calendar date to every row: consecutive days starting
# on 2010-01-01, one per row of the input.
data$date_c <- seq(
  from = as.Date("2010-01-01"),
  by = "days",
  length.out = nrow(data)
)

# Restrict the analysis to the first 4000 rows.
data1 <- head(data, n = 4000)

# Keep only the date and danceability columns and drop duplicated rows.
data_ <- distinct(select(data1, date_c, danceability))

Convert this tibble into a tsibble with as_tsibble so we can leverage time series functionality. To create a consistent and regular time index, I used index_by, which groups by the index in the same way group_by groups by ordinary columns.

# Build the tsibble: `date_c` is the time index, index_by() adds a `date`
# grouping of that index, and fill_gaps() turns implicit gaps into explicit
# rows.
data_ts <- data_ |>
  as_tsibble(index = date_c) |>
  index_by(date = date(date_c)) |>
  fill_gaps()

Next, I create a separate “xts” time series object for statistical modeling.

# Build an xts series of danceability ordered by `date_c`, and name its
# single column "danceability_s" so plots can refer to it.
data_xts <- data_ts$danceability |>
  xts(order.by = data_ts$date_c) |>
  setNames("danceability_s")

I used the rollapply function on the xts object with a window of 30 observations, and defined a mean that ignores missing values.

# Rolling mean of danceability over a 30-observation window, drawn as a line.
# The window function drops missing values so a single NA does not blank out
# the whole window. Uses the native pipe like the other chunks in this file.
# NOTE(review): `fill = FALSE` is passed through unchanged; confirm that it
# pads the window edges the way you intend (NA is the documented default).
data_xts |>
  rollapply(width = 30, FUN = \(x) mean(x, na.rm = TRUE), fill = FALSE) |>
  ggplot(mapping = aes(x = Index, y = danceability_s)) +
  geom_line() +
  # The title now says what is plotted (a rolling mean) and fixes the typo.
  labs(title = "30-Day Rolling Mean of Danceability Over Time") +
  theme_hc()

A different line is fit for each window, and the resulting curve follows the slopes of those lines. This method is designed specifically for plotting data, so you will likely only use it (as we have already done) with geom_smooth. Note that geom_smooth uses “loess” by default only when there are fewer than 1,000 observations; for larger data it switches to a GAM unless method = "loess" is requested explicitly.

# Scatter of danceability for 2015 through 2020 with a local-regression
# (loess) trend line.
# `method = "loess"` is requested explicitly: when left to choose for itself,
# geom_smooth() picked a GAM for this many points, and `span` only applies to
# the loess smoother, so the span setting had no effect.
data_ts |>
  filter_index("2015" ~ "2020") |>
  ggplot(mapping = aes(x = date_c, y = danceability)) +
  geom_point(size = 1, shape = "O") +
  geom_smooth(method = "loess", span = 0.2, color = "blue", se = FALSE) +
  labs(title = "Danceability During 2015 to 2020") +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'

The graph above shows an almost constant relationship, with a slight decrease in danceability between 2015 and 2020.

# Close-up of 2016-01 through 2017-01: the series as a line, with a straight
# least-squares trend line (no confidence band) drawn on top.
data_ts |>
  filter_index("2016-01" ~ "2017-01") |>
  ggplot(aes(date_c, danceability)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("danceability between 2016 to 2017") +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'

# Mean danceability per half year, with a loess trend line on top.
# Changes from the earlier version:
#   * the title named the wrong variable ("Earthquakes");
#   * the grouping column is called `half_year`, since it holds half-year
#     floors rather than years;
#   * `method = "loess"` is explicit so that `span` is actually honoured, and
#     the stray trailing comma in geom_smooth() is gone;
#   * the axis spacing uses the documented `date_breaks` argument.
data_ts |>
  index_by(half_year = floor_date(date, "half")) |>
  summarise(avg_danceability = mean(danceability, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = half_year, y = avg_danceability)) +
  geom_line() +
  geom_smooth(method = "loess", span = 0.3, color = "blue", se = FALSE) +
  labs(
    title = "Average Danceability Over Time",
    subtitle = "(by half year)",
    x = "Half year",
    y = "Average danceability"
  ) +
  # One tick per year, labelled with the year number only.
  scale_x_date(date_breaks = "1 year", labels = \(x) year(x)) +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'

Here we can see both a peak and a dip at least twice every year.

# Line chart of danceability against the `date` column for the whole series.
ggplot(data_ts, aes(x = date, y = danceability)) +
  geom_line() +
  ggtitle("danceability over time") +
  theme_hc()

This plot illustrates the relationship between danceability and date from 2010 to 2020.

# Plot ACF and PACF to illustrate seasonality.
# Both are computed with `na.action = na.pass`, so a missing observation
# stays in its slot and lag k keeps referring to rows k steps apart.
# Dropping the NA rows first would splice the neighbours of every gap
# together and shift all later lags.
your_acf <- acf(data_ts$danceability, lag.max = 30, na.action = na.pass)

your_pacf <- pacf(data_ts$danceability, lag.max = 30, na.action = na.pass)

# `data_ts` is still stripped of NA rows afterwards, as before, so any later
# code sees the same NA-free table.
data_ts <- na.omit(data_ts)

# ggplot rendering of the stored autocorrelation object
autoplot(your_acf) +
  ggtitle("Autocorrelation Function (ACF)") +
  xlab("Lag") +
  ylab("ACF")

# ggplot rendering of the stored partial autocorrelation object
autoplot(your_pacf) +
  ggtitle("Partial Autocorrelation Function (PACF)") +
  xlab("Lag") +
  ylab("PACF")

PACF - The PACF is a measure of the partial correlation between the current value of the time series and its lagged values, after controlling for the effects of the intermediate lagged values. A high PACF value at a particular lag indicates that there is a strong partial correlation between the current value of the time series and its value at that lag, after controlling for the effects of the intermediate lagged values.

The PACF plot decays quickly after lag 14, which suggests that there is no significant long-term trend in the time series, after controlling for the effects of the seasonal components.