# Install necessary packages if not already installed
if (!requireNamespace("feasts", quietly = TRUE)) {
install.packages("feasts")
}
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
if (!requireNamespace("tsibble", quietly = TRUE)) {
install.packages("tsibble")
}
# Load the required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tsibble 1.1.5 ✔ fable 0.3.4
## ✔ tsibbledata 0.4.1 ✔ fabletools 0.4.2
## ✔ feasts 0.3.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(feasts)
library(tsibble)
library(fabletools)
library(tsfeatures)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'tsfeatures'
##
## The following objects are masked from 'package:feasts':
##
## unitroot_kpss, unitroot_pp
# Install the tsibbledata package if not already installed
if (!requireNamespace("tsibbledata", quietly = TRUE)) {
install.packages("tsibbledata")
}
# Load the dataset
library(tsibbledata)
# Filter to Melbourne holiday trips; `tourism` is already a quarterly
# tsibble indexed by `Quarter`, so no further conversion is needed
ts_data <- tourism %>%
  filter(Region == "Melbourne", Purpose == "Holiday")
autoplot(ts_data, Trips) +
ggtitle("Time Plot of Holiday Trips in Melbourne") +
xlab("Time") +
ylab("Number of Trips")
gg_season(ts_data, Trips) +
ggtitle("Seasonal Plot of Holiday Trips in Melbourne") +
xlab("Quarter") +
ylab("Trips")
gg_subseries(ts_data, Trips) +
ggtitle("Subseries Plot of Holiday Trips in Melbourne") +
xlab("Quarter") +
ylab("Trips")
gg_lag(ts_data, Trips) +
ggtitle("Lag Plot of Holiday Trips in Melbourne")
ts_data %>%
model(STL(Trips ~ season(window = "periodic"))) %>%
components() %>%
autoplot() +
ggtitle("STL Decomposition of Holiday Trips in Melbourne")
# Add lagged copies of Trips so the pairs plot is a genuine scatterplot matrix
GGally::ggpairs(ts_data %>%
  as_tibble() %>%
  mutate(Trips_lag1 = lag(Trips), Trips_lag2 = lag(Trips, 2)) %>%
  select(Trips, Trips_lag1, Trips_lag2)) +
  ggtitle("Scatterplot Matrix for Holiday Trips in Melbourne")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Before plotting the ACF and PACF, recall what each function measures:
\[ \text{ACF at lag } k = \rho_k = \frac{\text{Cov}(Y_t, Y_{t-k})}{\text{Var}(Y_t)} = \frac{\sum_{t=k+1}^{T} (Y_t - \bar{Y})(Y_{t-k} - \bar{Y})}{\sum_{t=1}^{T} (Y_t - \bar{Y})^2} \]
\[ \text{PACF at lag } k = \phi_{kk} = \text{the partial correlation between } Y_t \text{ and } Y_{t-k} \text{ after removing the effect of the intervening lags.} \]
The PACF is mostly used to read off individual lags (for example, to identify the order of an AR model), while the ACF describes the overall dependence structure. Interpreting the ACF:
- Trend: if the ACF shows slow decay (correlations shrinking only gradually over lags), this suggests a trend in the data; the trend keeps correlations high even at distant lags.
- Seasonality: repeating spikes at regular intervals suggest seasonality. For example, a spike every 4 lags may indicate quarterly seasonality.
- Autoregressive (AR) component: a slow, geometric decay in the ACF is consistent with an autoregressive process, where the current value is a function of previous values.
- Moving Average (MA) component: for a pure MA process, the ACF cuts off sharply after the MA order. An MA(1) process shows a significant correlation at lag 1 and near-zero values for subsequent lags (see the simulated check after this list).
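To make the MA cutoff concrete, here is a small simulation; the coefficient 0.8, the series length of 200, and the seed are arbitrary illustrative choices.
# Simulate an MA(1) process and plot its ACF: expect a single spike at lag 1
set.seed(123)
sim <- tsibble(idx = 1:200,
  y = as.numeric(arima.sim(list(ma = 0.8), n = 200)),
  index = idx)
sim %>% ACF(y) %>% autoplot() +
  ggtitle("ACF of a Simulated MA(1) Process")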
# ACF Plot
acf_plot <- ts_data %>%
ACF(Trips) %>%
autoplot() +
ggtitle("ACF of Holiday Trips in Melbourne")
# PACF Plot
pacf_plot <- ts_data %>%
PACF(Trips) %>%
autoplot() +
ggtitle("PACF of Holiday Trips in Melbourne")
# Displaying the plots
acf_plot
pacf_plot
\[ E\left[\frac{R(n)}{S(n)}\right] \sim C\, n^{H} \]
where:
\(R(n)\) is the range of the cumulative deviations from the mean over a window of length \(n\), and
\(S(n)\) is the standard deviation over the same window.
In practice, \(H\) is estimated as the slope of a regression across window sizes:
\[ \log E\left[\frac{R(n)}{S(n)}\right] \approx \log C + H \log n \]
where \(n\) is the window size.
A Hurst value closer to 0.5 implies the series is a random walk, while closer to 1 suggests long-term positive autocorrelation.
hurst_result <- coef_hurst(ts_data$Trips)
print(hurst_result)
## coef_hurst
## 0.9417057
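To see where such a value comes from, here is a minimal hand-rolled rescaled-range sketch; the window sizes are arbitrary choices, and coef_hurst uses a different estimator internally, so the result will only be roughly comparable.
# Rescaled-range (R/S) estimate of H: slope of log(R/S) against log(n)
rs_hurst <- function(y, windows = c(8, 16, 32, 40)) {
  rs <- sapply(windows, function(n) {
    m <- floor(length(y) / n)
    mean(sapply(seq_len(m), function(i) {
      seg <- y[((i - 1) * n + 1):(i * n)]
      dev <- cumsum(seg - mean(seg))   # cumulative deviations from the mean
      (max(dev) - min(dev)) / sd(seg)  # rescaled range for this segment
    }))
  })
  coef(lm(log(rs) ~ log(windows)))[2]  # slope = Hurst estimate
}
rs_hurst(ts_data$Trips)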
The spectral entropy \(H_s\) is given by:
\[ H_s = -\sum_{i=1}^{N} P(f_i) \log P(f_i) \]
where \(P(f_i)\) is the normalised spectral density at frequency \(f_i\), scaled so that \(\sum_{i} P(f_i) = 1\).
Higher spectral entropy indicates a more uniform distribution of power across the frequencies, implying higher randomness in the signal.
A value closer to 0 means the series is predictable (trending/seasonal), while a value closer to 1 means the series is difficult to forecast (noisy).
spectral_entropy <- feat_spectral(ts_data$Trips)
print(spectral_entropy)
## spectral_entropy
## 0.3225375
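As an illustration, spectral entropy can be approximated from a raw periodogram with base R; feat_spectral uses a different spectral estimator and normalisation, so the value will not match exactly.
# Normalised Shannon entropy of the periodogram (illustrative approximation)
spec <- spec.pgram(ts_data$Trips, plot = FALSE)
p <- spec$spec / sum(spec$spec)    # normalise power into a probability mass
-sum(p * log(p)) / log(length(p))  # scaled to lie in [0, 1]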
\[ Q = n \sum_{k=1}^{m} \hat{\rho}_k^2 \] where \(n\) is the number of observations, \(\hat{\rho}_k\) is the autocorrelation at lag \(k\), and \(m\) is the number of lags considered.
If the p-value is small, the null hypothesis of white noise can be rejected.
box_pierce_test <- box_pierce(ts_data$Trips, lag = 10)
print(box_pierce_test)
## bp_stat bp_pvalue
## 306.3728 0.0000
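The same test is available in base R as Box.test(), which reports the statistic together with its p-value; a quick cross-check:
Box.test(ts_data$Trips, lag = 10, type = "Box-Pierce")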
\[ Q = n(n + 2) \sum_{k=1}^{m} \frac{\hat{\rho}_k^2}{n - k} \]
where \(n\) is the number of observations, \(\hat{\rho}_k\) is the autocorrelation at lag \(k\), and \(m\) is the number of lags considered.
If the p-value is small, the null hypothesis of white noise can be rejected.
ljung_box_test <- ljung_box(ts_data$Trips, lag = 10)
print(ljung_box_test)
## lb_stat lb_pvalue
## 335.3801 0.0000
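The Q statistic can also be reproduced by hand from the sample autocorrelations, which makes the formula above concrete:
# Ljung-Box Q computed directly from the first 10 autocorrelations
rho <- acf(ts_data$Trips, lag.max = 10, plot = FALSE)$acf[-1]  # drop lag 0
n <- length(ts_data$Trips)
n * (n + 2) * sum(rho^2 / (n - seq_along(rho)))  # should be close to lb_stat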
The sum of squares of the first five partial autocorrelations (pacf5) indicates the extent of short-term dependence in the series; diff1_pacf5 and diff2_pacf5 are the same quantity computed on the first- and second-differenced series.
pacf_features <- feat_pacf(ts_data$Trips)
print(pacf_features)
## pacf5 diff1_pacf5 diff2_pacf5
## 0.7755323 0.4156942 0.7941100
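The pacf5 value can be verified directly with base R's pacf():
# Sum of squares of the first five partial autocorrelations
phi <- pacf(ts_data$Trips, lag.max = 5, plot = FALSE)$acf
sum(phi^2)  # should match pacf5 above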
\[ \eta = \frac{1}{T^2 \hat{\sigma}^2} \sum_{t=1}^{T} S_t^2 \]
where \(S_t\) is the cumulative sum of residuals, \(T\) is the number of observations, and \(\hat{\sigma}^2\) is the long-run variance of the residuals.
A small p-value indicates the series is non-stationary.
kpss_test <- unitroot_kpss(ts_data$Trips)
print(kpss_test)
## [1] 1.999271
\[ \tau = \frac{\hat{\phi} - 1}{\text{SE}(\hat{\phi})} \]
where \(\hat{\phi}\) is the estimated autoregressive coefficient from the model \(y_t = \phi y_{t-1} + \epsilon_t\), and \(\text{SE}(\hat{\phi})\) is the standard error of \(\hat{\phi}\).
The null hypothesis is that the series contains a unit root, so a small p-value implies the series is stationary.
pp_test <- unitroot_pp(ts_data$Trips)
print(pp_test)
## [1] -13.98016
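Base R offers stats::PP.test(), which also reports a p-value; its statistic can differ from unitroot_pp's because the two use different long-run variance estimators.
PP.test(ts_data$Trips)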
\[ d = \min \{ d \mid Y_t^{(d)} \text{ is stationary} \} \]
where \(Y_t^{(d)}\) represents the time series after differencing \(d\) times.
The result indicates how many differences are needed for stationarity.
ndiffs_required <- unitroot_ndiffs(ts_data$Trips)
print(ndiffs_required)
## ndiffs
## 1
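As a sanity check, running the KPSS test on the first-differenced series should produce a much smaller statistic, consistent with a single difference achieving stationarity:
unitroot_kpss(diff(ts_data$Trips))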
\[ \text{var\_tiled\_mean} = \operatorname{Var}\left( \bar{Y}_1, \bar{Y}_2, \dots, \bar{Y}_k \right) \]
where the series is divided into \(k\) non-overlapping tiles of equal width and \(\bar{Y}_i\) is the mean of the observations in the \(i\)th tile. This feature is often called "stability".
A large variance of the tiled means indicates instability in the level of the series.
var_tiled_mean_result <- var_tiled_mean(ts_data$Trips)
print(var_tiled_mean_result)
## var_tiled_mean
## 0.8595831
\[ \text{var\_tiled\_var} = \operatorname{Var}\left( \operatorname{Var}(Y^{(1)}), \dots, \operatorname{Var}(Y^{(k)}) \right) \]
where \(\operatorname{Var}(Y^{(i)})\) is the variance of the observations within the \(i\)th tile. This feature is often called "lumpiness".
Higher values suggest the variance itself shifts from segment to segment.
var_tiled_var_result <- var_tiled_var(ts_data$Trips)
print(var_tiled_var_result)
## var_tiled_var
## 0.02874364
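Both tiled features can be sketched by hand; the tile width of 4 (one year of quarterly data) and the standardisation are assumptions here, and feasts uses its own defaults, so the numbers will differ.
# Tile the standardised series and compare the variability of tile summaries
y <- as.numeric(scale(ts_data$Trips))         # standardise the series
tiles <- split(y, ceiling(seq_along(y) / 4))  # tile width of 4 is an assumption
var(sapply(tiles, mean))                      # variance of tile means (stability)
var(sapply(tiles, var))                       # variance of tile variances (lumpiness)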
\[ C = \sum_{t=2}^{n} I\left[ (Y_t - m)(Y_{t-1} - m) < 0 \right] \]
where \(m\) is the median of the series and \(I[\cdot]\) is the indicator function.
The statistic counts the number of times the series crosses its median from one time point to the next.
A higher number of crossings indicates frequent changes in direction.
crossing_points <- n_crossing_points(ts_data$Trips)
print(crossing_points)
## n_crossing_points
## 17
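The count is easy to reproduce by hand, assuming crossings are measured around the median:
# Count sign changes of the series relative to its median
m <- median(ts_data$Trips)
sum(diff(ts_data$Trips > m) != 0)  # should match n_crossing_points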
\[ \text{Flat Spot} = \max_{j} \left\{ \text{length of the longest run of consecutive } Y_t \text{ falling in interval } j \right\} \]
where the range of the series is divided into equal-width intervals (ten by convention) and a run is a maximal stretch of consecutive observations remaining in the same interval.
Indicates periods of inactivity or low volatility in the series.
flat_spot <- longest_flat_spot(ts_data$Trips)
print(flat_spot)
## longest_flat_spot
## 4
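A rough reconstruction, assuming ten equal-width bins:
# Longest run of consecutive observations staying inside one bin
bins <- cut(ts_data$Trips, breaks = 10, labels = FALSE)
max(rle(bins)$lengths)  # should be close to longest_flat_spot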
\[ \text{LM} = n R^2 \]
where \(n\) is the number of observations and \(R^2\) is the coefficient of determination from a regression of the squared (demeaned) series on its own lagged squares. Note that stat_arch_lm returns the \(R^2\) value itself rather than \(nR^2\).
A large \(R^2\) (equivalently, a small p-value for the LM statistic) indicates the presence of ARCH effects (conditional heteroskedasticity).
arch_test <- stat_arch_lm(ts_data$Trips)
print(arch_test)
## stat_arch_lm
## 0.4376122
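A single-lag version of the underlying regression, as a sketch; stat_arch_lm uses more lags internally, so this R-squared is only indicative.
# Regress the squared demeaned series on one lag of itself
e2 <- as.numeric(scale(ts_data$Trips, scale = FALSE))^2
fit <- lm(e2[-1] ~ e2[-length(e2)])  # one lag for illustration only
summary(fit)$r.squared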
\[ y_t^{(\lambda)} = \begin{cases} \frac{y_t^\lambda - 1}{\lambda}, & \lambda \neq 0 \\ \log(y_t), & \lambda = 0 \end{cases} \]
Guerrero's method selects \(\lambda\) by minimising the coefficient of variation of the standard-deviation-to-mean ratios computed within subseries of the data (for quarterly data, typically within each year).
The optimal lambda value can be used for transforming the series to stabilize variance.
lambda_optimal <- guerrero(ts_data$Trips)
print(lambda_optimal)
## lambda_guerrero
## -0.220697
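The estimated \(\lambda\) can be applied with fabletools' box_cox() to stabilise the variance:
# Plot the Box-Cox transformed series using the Guerrero lambda
ts_data %>%
  autoplot(box_cox(Trips, lambda_optimal)) +
  ggtitle("Box-Cox Transformed Holiday Trips (Guerrero lambda)")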