1. Import packages

library(M4comp2018)
library(xts)
library(astsa)
library(ggplot2)
library(forecast)
library(ggfortify)
library(fpp2)

2. Load M4 data

data(M4)

3. Create a data frame to sumarize the M4 data structure

df = data.frame(matrix(ncol = 5, nrow = 100000))
colnames(df) = c("st", "n", "type", "h", "period")
df$st = unlist(Map(function(l) {as.character(l$st[[1]][1])}, M4))
df$n = unlist(Map(function(l) {c(l$n[[1]][1]) }, M4))
df$type = unlist(Map(function(l) {as.character(l$type[[1]][1])}, M4))
df$h = unlist(Map(function(l) {c(l$h[[1]][1]) }, M4))
df$period = unlist(Map(function(l) {as.character(l$period[[1]][1])}, M4))
str(df)
## 'data.frame':    100000 obs. of  5 variables:
##  $ st    : chr  "D1" "D2" "D3" "D4" ...
##  $ n     : int  1006 1006 130 169 156 1006 1006 999 999 674 ...
##  $ type  : chr  "Macro" "Macro" "Macro" "Macro" ...
##  $ h     : int  14 14 14 14 14 14 14 14 14 14 ...
##  $ period: chr  "Daily" "Daily" "Daily" "Daily" ...
table(df$period)
## 
##     Daily    Hourly   Monthly Quarterly    Weekly    Yearly 
##      4227       414     48000     24000       359     23000

4. Extract M4 data: yearly, quarterly, monthly, dayly, hourly

yearly_M4 = Filter(function(l) l$period == "Yearly", M4)
quarterly_M4 = Filter(function(l) l$period == "Quarterly", M4)
monthly_M4 = Filter(function(l) l$period == "Monthly", M4)
weekly_M4 = Filter(function(l) l$period == "Weekly", M4)
hourly_M4 = Filter(function(l) l$period == "Hourly", M4)
daily_M4 = Filter(function(l) l$period == "Daily", M4)

5. Plot the first sample

(including n_ahead data in red)

6. Extract one sample month to perform statistical models

Extract the first month: full, training, and test set

monthly_1_full = ts(c(monthly_M4[[1]]$x, monthly_M4[[1]]$xx),
               start=start(monthly_M4[[1]]$x), 
               frequency = frequency(monthly_M4[[1]]$x))
monthly_1_train = ts(monthly_M4[[1]]$x, 
                  start=start(monthly_M4[[1]]$x), 
                  frequency = frequency(monthly_M4[[1]]$x))
monthly_1_test = ts(monthly_M4[[1]]$xx, 
                  start=start(monthly_M4[[1]]$xx), 
                  frequency = frequency(monthly_M4[[1]]$xx))

Explore the structure of the training and test set

head(monthly_1_train)
##       Jun  Jul  Aug  Sep  Oct  Nov
## 1976 8000 8350 8570 7700 7080 6520
str(monthly_1_train)
##  Time-Series [1:469] from 1976 to 2015: 8000 8350 8570 7700 7080 6520 6070 6650 6830 5710 ...
head(monthly_1_test)
##       Jul  Aug  Sep  Oct  Nov  Dec
## 2015 8720 7790 4770 5060 4720 4450
str(monthly_1_test)
##  Time-Series [1:18] from 2016 to 2017: 8720 7790 4770 5060 4720 4450 5120 5960 6560 4900 ...

plot the monthly sample: including full, training and test set

plot(ts(monthly_1_full,
        start = start(monthly_1_full),
        frequency = frequency(monthly_1_full)),
     type = 'l', col = 'black', ylab ='', xlab ='')
lines(monthly_1_test, col = 'red', type = 'o', xlab = '')

Produce a polar coordinate season plot

ggseasonplot(monthly_1_train, polar = T)

Create subseries plot that comprises mini time plots for each season

ggsubseriesplot(monthly_1_train)

### Plot the monthly sample: Removing trend

autoplot(diff(monthly_1_train))

## 7. Explore time series patterns: Trend, seasonal, or cyclic

acf2(diff(monthly_1_train))

##         ACF  PACF
##  [1,]  0.07  0.07
##  [2,] -0.12 -0.12
##  [3,] -0.35 -0.34
##  [4,] -0.25 -0.26
##  [5,]  0.19  0.14
##  [6,]  0.04 -0.16
##  [7,]  0.13  0.00
##  [8,] -0.25 -0.28
##  [9,] -0.32 -0.35
## [10,] -0.09 -0.29
## [11,]  0.20 -0.06
## [12,]  0.62  0.37
## [13,]  0.19  0.26
## [14,] -0.13  0.11
## [15,] -0.31  0.08
## [16,] -0.20  0.03
## [17,]  0.14  0.01
## [18,]  0.07 -0.07
## [19,]  0.11  0.08
## [20,] -0.25 -0.02
## [21,] -0.31 -0.07
## [22,] -0.07 -0.06
## [23,]  0.19 -0.09
## [24,]  0.59  0.16
## [25,]  0.16  0.06
## [26,] -0.12  0.01
## [27,] -0.28  0.06
## [28,] -0.20  0.00
## [29,]  0.13 -0.02
## [30,]  0.04 -0.13
## [31,]  0.11  0.01
## [32,] -0.24 -0.04
## [33,] -0.26  0.04
## [34,] -0.11 -0.10
## [35,]  0.19 -0.07
## [36,]  0.57  0.13
## [37,]  0.15  0.04
## [38,] -0.12 -0.05
## [39,] -0.30 -0.02
## [40,] -0.20 -0.10
## [41,]  0.14 -0.02
## [42,]  0.04 -0.09
## [43,]  0.12  0.03
## [44,] -0.25 -0.07
## [45,] -0.25  0.03
## [46,] -0.09 -0.06
## [47,]  0.16 -0.08
## [48,]  0.54  0.01

8. Ljung-Box test: Finding h autocorrelation

Box.test(monthly_1_train, fitdf = 0, lag = 24, type = 'Lj')
## 
##  Box-Ljung test
## 
## data:  monthly_1_train
## X-squared = 4388.7, df = 24, p-value < 2.2e-16
Box.test(diff(monthly_1_train), fitdf = 0, lag = 24, type = 'Lj')
## 
##  Box-Ljung test
## 
## data:  diff(monthly_1_train)
## X-squared = 786.08, df = 24, p-value < 2.2e-16

The Ljung-Box test result (p-value <<< 0.05) shows autocorrelation in both original data and differencing data.

9. Employ statistical model to forecast

9.1. Seasonal Naive model

Model summary and plot

snaive_f = snaive(diff(monthly_1_train), h = 18)
autoplot(snaive_f) 

summary(snaive_f)
## 
## Forecast method: Seasonal naive method
## 
## Model Information:
## Call: snaive(y = diff(monthly_1_train), h = 18) 
## 
## Residual sd: 727.3591 
## 
## Error measures:
##                      ME     RMSE      MAE MPE MAPE MASE       ACF1
## Training set -0.9429825 726.5617 565.8114 NaN  Inf    1 -0.3595435
## 
## Forecasts:
##          Point Forecast      Lo 80      Hi 80       Lo 95      Hi 95
## Jul 2015           1920   988.8737 2851.12627   495.96526 3344.03474
## Aug 2015          -1390 -2321.1263 -458.87373 -2814.03474   34.03474
## Sep 2015          -1930 -2861.1263 -998.87373 -3354.03474 -505.96526
## Oct 2015           -410 -1341.1263  521.12627 -1834.03474 1014.03474
## Nov 2015            580  -351.1263 1511.12627  -844.03474 2004.03474
## Dec 2015           -480 -1411.1263  451.12627 -1904.03474  944.03474
## Jan 2016            300  -631.1263 1231.12627 -1124.03474 1724.03474
## Feb 2016           -910 -1841.1263   21.12627 -2334.03474  514.03474
## Mar 2016            -40  -971.1263  891.12627 -1464.03474 1384.03474
## Apr 2016            120  -811.1263 1051.12627 -1304.03474 1544.03474
## May 2016           -300 -1231.1263  631.12627 -1724.03474 1124.03474
## Jun 2016           1980  1048.8737 2911.12627   555.96526 3404.03474
## Jul 2016           1920   603.1886 3236.81140   -93.88924 3933.88924
## Aug 2016          -1390 -2706.8114  -73.18860 -3403.88924  623.88924
## Sep 2016          -1930 -3246.8114 -613.18860 -3943.88924   83.88924
## Oct 2016           -410 -1726.8114  906.81140 -2423.88924 1603.88924
## Nov 2016            580  -736.8114 1896.81140 -1433.88924 2593.88924
## Dec 2016           -480 -1796.8114  836.81140 -2493.88924 1533.88924

Fitted values and residuals

autoplot(snaive_f, series = "Train")

Check residuals whether white noise or not

checkresiduals(snaive_f)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 225.16, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Accuracy of Seasonal Naive model

accuracy(snaive_f, diff(monthly_1_test))
##                       ME     RMSE      MAE      MPE     MAPE     MASE
## Training set  -0.9429825 726.5617 565.8114      NaN      Inf 1.000000
## Test set     -30.5882353 815.6845 632.9412 80.09394 97.85567 1.118643
##                    ACF1 Theil's U
## Training set -0.3595435        NA
## Test set     -0.1157613 0.5601452

9. Seasonal Arima model

Model summary and plot

sarima_f = auto.arima(monthly_1_train)
summary(sarima_f)
## Series: monthly_1_train 
## ARIMA(1,0,5)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4     ma5     sma1
##       0.9791  -0.3524  -0.0601  -0.1436  0.0074  0.1266  -0.7895
## s.e.  0.0110   0.0489   0.0506   0.0520  0.0520  0.0436   0.0338
## 
## sigma^2 estimated as 268035:  log likelihood=-3507.04
## AIC=7030.07   AICc=7030.39   BIC=7063.07
## 
## Training set error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -3.835782 507.1257 389.5809 -0.5104896 6.365607 0.5128426
##                       ACF1
## Training set -0.0007186712
sarima_f %>% 
  forecast(h = 18) %>%
  autoplot()

Check residuals whether white noise or not

checkresiduals(sarima_f)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,5)(0,1,1)[12]
## Q* = 30.259, df = 17, p-value = 0.02454
## 
## Model df: 7.   Total lags used: 24

Accuracy of Seasonal Arima model

accuracy(forecast(sarima_f, h = 18), monthly_1_test, d = 0, D = 1)
##                       ME     RMSE      MAE        MPE      MAPE      MASE
## Training set   -3.835782 507.1257 389.5809 -0.5104896  6.365607 0.5128426
## Test set     -111.744545 774.8113 687.0752 -4.4136328 12.765260 0.9044630
##                       ACF1 Theil's U
## Training set -0.0007186712        NA
## Test set      0.4626465094 0.6106427