Load

# Install necessary packages if not already installed
if (!requireNamespace("feasts", quietly = TRUE)) {
  install.packages("feasts")
}
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
if (!requireNamespace("tsibble", quietly = TRUE)) {
  install.packages("tsibble")
}

# Load the required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tsibble     1.1.5     ✔ fable       0.3.4
## ✔ tsibbledata 0.4.1     ✔ fabletools  0.4.2
## ✔ feasts      0.3.2     
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(feasts)
library(tsibble)
library(fabletools)
library(tsfeatures)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Attaching package: 'tsfeatures'
## 
## The following objects are masked from 'package:feasts':
## 
##     unitroot_kpss, unitroot_pp
# Install the tsibbledata package if not already installed
if (!requireNamespace("tsibbledata", quietly = TRUE)) {
  install.packages("tsibbledata")
}

# Load the dataset
library(tsibbledata)

Get Data

data <- tourism %>% 
  filter(Region == "Melbourne", Purpose == "Holiday") %>% 
  select(Trips)

# `tourism` is already a tsibble indexed by `Quarter`; re-declare the index after filtering down to a single series
ts_data <- tourism %>% 
  filter(Region == "Melbourne", Purpose == "Holiday") %>% 
  as_tsibble(index = Quarter)

Autoplot

autoplot(ts_data, Trips) +
  ggtitle("Time Plot of Holiday Trips in Melbourne") +
  xlab("Time") +
  ylab("Number of Trips")

gg_season(ts_data, Trips) +
  ggtitle("Seasonal Plot of Holiday Trips in Melbourne") +
  xlab("Quarter") +
  ylab("Trips")

gg_subseries(ts_data, Trips) +
  ggtitle("Subseries Plot of Holiday Trips in Melbourne") +
  xlab("Quarter") +
  ylab("Trips")

gg_lag(ts_data, Trips) +
  ggtitle("Lag Plot of Holiday Trips in Melbourne")

ts_data %>%
  model(STL(Trips ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot() +
  ggtitle("STL Decomposition of Holiday Trips in Melbourne")

GGally::ggpairs(ts_data %>%
  as_tibble() %>%
  select(Trips)) +
  ggtitle("Scatterplot Matrix for Holiday Trips in Melbourne")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Plotting ACF and PACF

\[ \text{ACF at lag } k = \rho_k = \frac{\text{Cov}(Y_t, Y_{t-k})}{\text{Var}(Y_t)} = \frac{\sum_{t=k+1}^{T} (Y_t - \bar{Y})(Y_{t-k} - \bar{Y})}{\sum_{t=1}^{T} (Y_t - \bar{Y})^2} \]

\[ \text{PACF at lag } k = \phi_{kk} = \text{the partial correlation between } Y_t \text{ and } Y_{t-k} \text{ after removing the effect of the intervening lags.} \]

PACF: the partial autocorrelation at each lag, i.e. the correlation that remains after removing the effects of the intervening lags. It is mostly used to identify the order of an AR process: an AR(p) series shows significant PACF spikes up to lag \(p\), then a cutoff.

ACF

Trend: If the ACF shows slow decay (gradually decreasing correlation over lags), this suggests a trend in the data. The trend causes high correlations even for distant lags.

Seasonality: If the ACF shows repeating spikes at regular intervals (lags), it suggests seasonality. For example, if there is a spike every 4 lags, this may indicate quarterly seasonality.

Autoregressive (AR) component: A slow decay in ACF may suggest an autoregressive process, where the current value is a function of previous values.

Moving Average (MA) component: For a pure MA process, the ACF will cut off sharply after the MA order. For example, if you have an MA(1) process, ACF will show a significant correlation at lag 1 and then drop to near-zero for subsequent lags.
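As a quick illustration of the MA cutoff versus the AR decay (simulated data, not the tourism series; the coefficient 0.8 is arbitrary):

# Simulate an MA(1) and an AR(1) and compare their sample ACFs
set.seed(42)
ma1 <- arima.sim(model = list(ma = 0.8), n = 200)
ar1 <- arima.sim(model = list(ar = 0.8), n = 200)
acf(ma1, main = "MA(1): ACF cuts off after lag 1")
acf(ar1, main = "AR(1): ACF decays gradually")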

# ACF Plot
acf_plot <- ts_data %>%
  ACF(Trips) %>%
  autoplot() + 
  ggtitle("ACF of Holiday Trips in Melbourne")

# PACF Plot
pacf_plot <- ts_data %>%
  PACF(Trips) %>%
  autoplot() + 
  ggtitle("PACF of Holiday Trips in Melbourne")

# Displaying the plots
acf_plot

pacf_plot

Hurst

\[ E\left[\frac{R(n)}{S(n)}\right] = C \, n^{H} \quad \text{as } n \to \infty \]

where:

\(R(n)\) is the range of the cumulative deviations from the mean over a window of length \(n\), and

\(S(n)\) is the standard deviation over that window.

Taking logarithms, \(H\) is the slope of \(\log(R/S)\) against \(\log(n)\); for a single window this reduces to the rough estimate

\[ H \approx \frac{\log\left(R(n)/S(n)\right)}{\log(n)} \]

where \(n\) is the window size.

A Hurst value closer to 0.5 implies the series is a random walk, while closer to 1 suggests long-term positive autocorrelation.

hurst_result <- coef_hurst(ts_data$Trips)
print(hurst_result)
## coef_hurst 
##  0.9417057
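For intuition, here is a minimal sketch of the rescaled-range quantities from the formula above, treating the whole series as a single window. Note that coef_hurst() estimates H differently (via a fractional-differencing fit), so the numbers will not match exactly.

# Single-window rescaled-range (R/S) calculation
rs_ratio <- function(y) {
  dev <- cumsum(y - mean(y))       # cumulative deviations from the mean
  (max(dev) - min(dev)) / sd(y)    # range of deviations, rescaled by the sd
}
log(rs_ratio(ts_data$Trips)) / log(length(ts_data$Trips))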

Spectral Entropy

The spectral entropy \(H_s\) is given by:

\[ H_s = -\frac{1}{\log N} \sum_{i=1}^{N} P(f_i) \log P(f_i) \]

where:

  • \(P(f_i)\) is the power spectral density at frequency \(f_i\), normalized so that \(\sum_i P(f_i) = 1\), and
  • \(N\) is the total number of frequency bins; the \(1/\log N\) factor scales the entropy to \([0, 1]\).

Higher spectral entropy indicates a more uniform distribution of power across the frequencies, implying higher randomness in the signal.

A value closer to 0 means the series is predictable (trending/seasonal), while a value closer to 1 means the series is difficult to forecast (noisy).

spectral_entropy <- feat_spectral(ts_data$Trips)
print(spectral_entropy)
## spectral_entropy 
##        0.3225375
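A minimal sketch of the calculation using base R's periodogram (feat_spectral's internal spectral estimator may differ, so the value is only indicative):

# Normalized Shannon entropy of the periodogram
spec <- spec.pgram(ts_data$Trips, plot = FALSE)
p <- spec$spec / sum(spec$spec)    # normalize the PSD so it sums to 1
-sum(p * log(p)) / log(length(p))  # divide by log(N) to scale to [0, 1]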

Box Pierce Tests

\[ Q = n \sum_{k=1}^{m} \hat{\rho}_k^2 \] where \(n\) is the number of observations, \(\hat{\rho}_k\) is the autocorrelation at lag \(k\), and \(m\) is the number of lags considered.

If the p-value is small, the null hypothesis of white noise can be rejected.

box_pierce_test <- box_pierce(ts_data$Trips, lag = 10)
print(box_pierce_test)
##   bp_stat bp_pvalue 
##  306.3728    0.0000
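The statistic can be reproduced directly from the sample autocorrelations:

# Box-Pierce Q statistic computed by hand
y <- ts_data$Trips
n <- length(y)
rho <- acf(y, lag.max = 10, plot = FALSE)$acf[-1]  # drop lag 0
n * sum(rho^2)                                     # matches bp_stat above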

Ljung-Box Test

\[ Q = n(n + 2) \sum_{k=1}^{m} \frac{\hat{\rho}_k^2}{n - k} \]

where \(n\) is the number of observations, \(\hat{\rho}_k\) is the autocorrelation at lag \(k\), and \(m\) is the number of lags considered.

If the p-value is small, the null hypothesis of white noise can be rejected.

ljung_box_test <- ljung_box(ts_data$Trips, lag = 10)
print(ljung_box_test)
##   lb_stat lb_pvalue 
##  335.3801    0.0000
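Likewise by hand, or via base R's Box.test():

# Ljung-Box Q statistic computed by hand
y <- ts_data$Trips
n <- length(y)
rho <- acf(y, lag.max = 10, plot = FALSE)$acf[-1]
n * (n + 2) * sum(rho^2 / (n - seq_along(rho)))  # matches lb_stat above
Box.test(y, lag = 10, type = "Ljung-Box")        # same test in base R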

PACF

The `pacf5` feature is the sum of squares of the first five partial autocorrelations, indicating the extent of short-term dependence in the series; `diff1_pacf5` and `diff2_pacf5` report the same quantity for the first- and second-differenced series.

pacf_features <- feat_pacf(ts_data$Trips)
print(pacf_features)
##       pacf5 diff1_pacf5 diff2_pacf5 
##   0.7755323   0.4156942   0.7941100
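The pacf5 value can be checked against base R's pacf():

# Sum of squares of the first five partial autocorrelations
sum(pacf(ts_data$Trips, lag.max = 5, plot = FALSE)$acf^2)  # compare with pacf5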

KPSS Test

\[ \eta = \frac{1}{T^2} \sum_{t=1}^{T} S_t^2 / \hat{\sigma}^2 \]

where \(S_t\) is the cumulative sum of residuals, \(T\) is the number of observations, and \(\hat{\sigma}^2\) is the long-run variance of the residuals.

Under the KPSS test the null hypothesis is stationarity, so a small p-value (equivalently, a large statistic) indicates the series is non-stationary.

kpss_test <- unitroot_kpss(ts_data$Trips)
print(kpss_test)
## [1] 1.999271
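Only the statistic is printed because tsfeatures masks the feasts version of unitroot_kpss (see the load messages above). The feasts version, called through features(), also reports a p-value:

ts_data %>% features(Trips, feasts::unitroot_kpss)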

Unit Root Test (Phillips-Perron)

\[ \tau = \frac{\hat{\phi} - 1}{\text{SE}(\hat{\phi})} \]

where \(\hat{\phi}\) is the estimated autoregressive coefficient from the model \(y_t = \phi y_{t-1} + \epsilon_t\), and \(\text{SE}(\hat{\phi})\) is the standard error of \(\hat{\phi}\). The Phillips-Perron test applies a nonparametric correction to this Dickey-Fuller-type statistic so that it remains valid under serial correlation and heteroskedasticity in \(\epsilon_t\).

The null hypothesis is a unit root (non-stationarity), so a small p-value implies the series is stationary.

pp_test <- unitroot_pp(ts_data$Trips)
print(pp_test)
## [1] -13.98016
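As with the KPSS test, the feasts version reports a p-value when called through features():

ts_data %>% features(Trips, feasts::unitroot_pp)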

NDIFFs

\[ d = \min \{ d \mid Y_t^{(d)} \text{ is stationary} \} \]

where \(Y_t^{(d)}\) represents the time series after differencing \(d\) times.

The result indicates how many differences are needed for stationarity.

ndiffs_required <- unitroot_ndiffs(ts_data$Trips)
print(ndiffs_required)
## ndiffs 
##      1
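As a check, one round of differencing should bring the KPSS statistic down to a level consistent with stationarity (unitroot_ndiffs is based on repeated unit-root tests, by default KPSS):

# KPSS statistic after a single difference
unitroot_kpss(diff(ts_data$Trips))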

VAR-Tiled Mean

\[ \text{var\_tiled\_mean} = \text{Var}\left( \bar{Y}_1, \bar{Y}_2, \ldots, \bar{Y}_k \right) \]

where the series is divided into \(k\) non-overlapping tiles (windows) and \(\bar{Y}_i\) is the mean of the observations within the \(i\)th tile.

Large variations in tiled means indicate instability in the series.

var_tiled_mean_result <- var_tiled_mean(ts_data$Trips)
print(var_tiled_mean_result)
## var_tiled_mean 
##      0.8595831
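A sketch of the tiling idea, assuming non-overlapping tiles of width 10 on the scaled series (the default tile width in feasts may differ):

# Variance of the tile means on the scaled series
y <- as.numeric(scale(ts_data$Trips))
tiles <- split(y, ceiling(seq_along(y) / 10))  # non-overlapping tiles
var(sapply(tiles, mean))                       # compare with var_tiled_mean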

Var-Tiled Var

\[ \text{var\_tiled\_var} = \text{Var}\left( \text{Var}(Y_{(1)}), \ldots, \text{Var}(Y_{(k)}) \right) \]

where the series is divided into \(k\) non-overlapping tiles and \(\text{Var}(Y_{(i)})\) is the variance of the observations within the \(i\)th tile.

Higher values (this feature is sometimes called lumpiness) suggest the variance changes markedly from one segment of the series to another.

var_tiled_var_result <- var_tiled_var(ts_data$Trips)
print(var_tiled_var_result)
## var_tiled_var 
##    0.02874364
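And the matching sketch for the variance of the tile variances, under the same assumptions:

y <- as.numeric(scale(ts_data$Trips))
tiles <- split(y, ceiling(seq_along(y) / 10))
var(sapply(tiles, var))  # compare with var_tiled_var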

Crossing Points

\[ C = \sum_{t=2}^{n} I\left[ (Y_t - \bar{Y})(Y_{t-1} - \bar{Y}) < 0 \right] \]

where:

  • \(Y_t\) is the time series,
  • \(\bar{Y}\) is the mean (or median) of the time series,
  • \(I(\cdot)\) is the indicator function, which equals 1 when the condition is true and 0 otherwise,
  • \(n\) is the total number of observations.

The test counts the number of times the series crosses the mean (or median) from one time point to the next.

A higher number of crossings indicates frequent changes in direction.

crossing_points <- n_crossing_points(ts_data$Trips)
print(crossing_points)
## n_crossing_points 
##                17
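The count can be reproduced in a couple of lines (feasts counts crossings of the median):

# Count sign changes of the series about its median
y <- ts_data$Trips
m <- median(y)
sum((head(y, -1) - m) * (tail(y, -1) - m) < 0)  # compare with n_crossing_points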

Flat Spot

The range of the series is divided into ten equal-width intervals, and the feature is the length of the longest run of consecutive observations that stay within a single interval:

\[ \text{longest\_flat\_spot} = \max_{j} \, \ell_j \]

where \(\ell_j\) is the length of the longest run of consecutive observations falling in interval \(j\).

Indicates periods of inactivity or low volatility in the series.

flat_spot <- longest_flat_spot(ts_data$Trips)
print(flat_spot)
## longest_flat_spot 
##                 4
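A sketch of the computation, cutting the range of the series into ten equal-width bins:

# Longest run of consecutive observations staying within one bin
bins <- cut(ts_data$Trips, breaks = 10, labels = FALSE)
max(rle(bins)$lengths)  # compare with longest_flat_spot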

Arch Test

\[ \text{ARCH} = n R^2 \]

where \(n\) is the number of observations, and \(R^2\) is the coefficient of determination from the regression of squared residuals on lagged squared residuals.

In the formal LM test, a small p-value indicates the presence of ARCH effects (conditional heteroskedasticity). `stat_arch_lm` reports the \(R^2\) from this regression rather than \(n R^2\), so values closer to 1 indicate stronger ARCH effects.

arch_test <- stat_arch_lm(ts_data$Trips)
print(arch_test)
## stat_arch_lm 
##    0.4376122
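A rough sketch of the underlying regression, using a single lag for illustration (stat_arch_lm's exact pre-processing and number of lags may differ):

# R^2 from regressing squared demeaned values on their own lag
y2 <- (ts_data$Trips - mean(ts_data$Trips))^2
summary(lm(y2[-1] ~ y2[-length(y2)]))$r.squared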

Guerrero Method (Box-Cox Lambda Selection)

\[ y_t^{(\lambda)} = \begin{cases} \frac{y_t^\lambda - 1}{\lambda}, & \lambda \neq 0 \\ \log(y_t), & \lambda = 0 \end{cases} \]

Guerrero's method selects \(\lambda\) to minimize the coefficient of variation (the ratio of the standard deviation to the mean) computed across subseries of the transformed data.

The optimal lambda value can be used for transforming the series to stabilize variance.

lambda_optimal <- guerrero(ts_data$Trips)
print(lambda_optimal)
## lambda_guerrero 
##       -0.220697
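The selected lambda can then be applied with box_cox() to stabilize the variance before modelling:

autoplot(ts_data, box_cox(Trips, lambda_optimal)) +
  ggtitle("Box-Cox Transformed Holiday Trips (Guerrero lambda)")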