library(tidyverse)
library(lubridate)
library(forecast)
library(TTR)
library(fpp)
library(tseries)
library(TSstudio)
library(padr)
library(recipes)
library(tidyquant)

Explanation

Hi! Welcome to my LBB. In this LBB I'm going to use a sales dataset from the tsdl library and build time series models using the ARIMA, Holt-Winters, STLF, and TBATS methods. Enjoy!

Input Data

#devtools::install_github("FinYang/tsdl")
library(tsdl)
tsdl
#> Time Series Data Library: 648 time series  
#> 
#>                        Frequency
#> Subject                 0.1 0.25   1   4   5   6  12  13  52 365 Total
#>   Agriculture             0    0  37   0   0   0   3   0   0   0    40
#>   Chemistry               0    0   8   0   0   0   0   0   0   0     8
#>   Computing               0    0   6   0   0   0   0   0   0   0     6
#>   Crime                   0    0   1   0   0   0   2   1   0   0     4
#>   Demography              1    0   9   2   0   0   3   0   0   2    17
#>   Ecology                 0    0  23   0   0   0   0   0   0   0    23
#>   Finance                 0    0  23   5   0   0  20   0   2   1    51
#>   Health                  0    0   8   0   0   0   6   0   1   0    15
#>   Hydrology               0    0  42   0   0   0  78   1   0   6   127
#>   Industry                0    0   9   0   0   0   2   0   1   0    12
#>   Labour market           0    0   3   4   0   0  17   0   0   0    24
#>   Macroeconomic           0    0  18  33   0   0   5   0   0   0    56
#>   Meteorology             0    0  18   0   0   0  17   0   0  12    47
#>   Microeconomic           0    0  27   1   0   0   7   0   1   0    36
#>   Miscellaneous           0    0   4   0   1   1   3   0   1   0    10
#>   Physics                 0    0  12   0   0   0   4   0   0   0    16
#>   Production              0    0   4  14   0   0  28   1   1   0    48
#>   Sales                   0    0  10   3   0   0  24   0   9   0    46
#>   Sport                   0    1   1   0   0   0   0   0   0   0     2
#>   Transport and tourism   0    0   1   1   0   0  12   0   0   0    14
#>   Tree-rings              0    0  34   0   0   0   1   0   0   0    35
#>   Utilities               0    0   2   1   0   0   8   0   0   0    11
#>   Total                   1    1 300  64   1   1 240   3  16  21   648

We're going to use the Sales subject with frequency 12 (monthly data).

tsdl_sales <- subset(tsdl, 12, "Sales")
sales_ts <- tsdl_sales[[2]]

Data Inspections

The data runs from 1965 to 1975 and is already a time series (ts) object, so we don't need to create one. A quick check of its attributes follows the printout below.

sales_ts
#>      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> 1965  38  44  53  49  54  57  51  58  48  44  42  37
#> 1966  42  43  53  49  49  40  40  36  29  31  26  23
#> 1967  29  32  41  44  49  47  46  47  43  45  34  31
#> 1968  35  43  46  46  43  41  44  47  41  40  32  32
#> 1969  34  40  43  42  43  44  39  40  33  32  31  28
#> 1970  34  29  36  42  43  44  44  48  45  44  40  37
#> 1971  45  49  62  62  58  59  64  62  50  52  50  44
#> 1972  51  56  60  65  64  63  63  72  61  65  51  47
#> 1973  54  58  66  63  64  60  53  52  44  40  36  28
#> 1974  36  42  53  53  55  48  47  43  39  33  30  23
#> 1975  29  33  44  54  56  51  51  53  45  45  44  38
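
As a quick sanity check (a small sketch added here, not part of the original output), we can confirm the class and time attributes of the object:

class(sales_ts)     # should be "ts"
start(sales_ts)     # expected 1965, period 1 (January)
end(sales_ts)       # expected 1975, period 12 (December)
frequency(sales_ts) # expected 12 (monthly data)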

Let's check whether there are any missing values in our data.

anyNA(sales_ts)
#> [1] FALSE

Great! There are no missing values, so we can proceed to the seasonality analysis.

Seasonality Analysis

sales_ts %>% 
  autoplot()

Decompose the time series object and plot it to see whether the data contain seasonal and trend components.

sales_ts %>% 
  decompose() %>% 
  autoplot()

Based on the plot above, the data have a clear seasonal component and no clear trend.
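
To back that visual reading with numbers, here is a small sketch (an addition, not part of the original workflow) that computes the strength-of-seasonality and strength-of-trend measures from Hyndman & Athanasopoulos (FPP) using the decompose() components; the object names (dec, remainder, f_seasonal, f_trend) are illustrative. Values close to 1 indicate a strong component, values close to 0 a weak one.

dec       <- decompose(sales_ts)
remainder <- na.omit(dec$random)   # drop the NA edges left by the moving average
seasonal  <- window(dec$seasonal, start = start(remainder), end = end(remainder))
trend     <- window(dec$trend,    start = start(remainder), end = end(remainder))

# strength of a component: 1 - Var(remainder) / Var(component + remainder)
f_seasonal <- max(0, 1 - var(remainder) / var(seasonal + remainder))
f_trend    <- max(0, 1 - var(remainder) / var(trend + remainder))
c(seasonal_strength = f_seasonal, trend_strength = f_trend)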

Cross Validation

Before training the models we need to split the dataset into a training set and a test set. The test set consists of the last 24 observations (2 years), and the training set consists of the remaining 108 observations (total data minus the test set).

test_sales <- tail(sales_ts, 24)
train_sales <- head(sales_ts, length(sales_ts) - length(test_sales))
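
An equivalent split can be done with window(), which indexes by date rather than by position; a short sketch (the _w names are illustrative, not part of the original code):

# split by date instead of by position
train_sales_w <- window(sales_ts, end = c(1973, 12))
test_sales_w  <- window(sales_ts, start = c(1974, 1))
length(train_sales_w) # expected 108 (Jan 1965 - Dec 1973)
length(test_sales_w)  # expected  24 (Jan 1974 - Dec 1975)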

Build Model

Because our time series data have a clear seasonal component, we will build models using:

  1. ARIMA
  2. Holt-Winters
  3. STLF
  4. TBATS

ARIMA

Modeling

model_arima_ts <- stlm(train_sales, s.window = 12, method = "arima")
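
Note that stlm() does not fit the ARIMA model to the raw series directly: it first decomposes the series with STL and then fits the ARIMA model to the seasonally adjusted series. A small sketch (an addition, assuming the fitted model is stored in the $model element of the stlm object) to inspect the ARIMA specification that was selected:

# the ARIMA fit on the seasonally adjusted series
summary(model_arima_ts$model)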

Forecast

forecast_arima_ts <- forecast(model_arima_ts, h = 24)

Performance

accuracy(forecast_arima_ts, test_sales)
#>                       ME     RMSE      MAE        MPE      MAPE      MASE
#> Training set -0.03991406 3.194597 2.541129 -0.3374045  5.661865 0.3214076
#> Test set      3.99630621 6.757250 5.996553  7.0250794 13.705715 0.7584573
#>                    ACF1 Theil's U
#> Training set -0.1911220        NA
#> Test set      0.7512761  1.033144
test_forecast(actual = sales_ts, forecast.obj = forecast_arima_ts, train = train_sales, test = test_sales)

Holt Winters

Modeling

model_hw_ts <- HoltWinters(train_sales)
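
HoltWinters() estimates the level, trend, and seasonal smoothing parameters (alpha, beta, gamma) automatically when they are not supplied. A short sketch (not in the original code) to inspect the fitted values:

# fitted smoothing parameters: level (alpha), trend (beta), seasonal (gamma)
model_hw_ts$alpha
model_hw_ts$beta
model_hw_ts$gamma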

Forecast

forecast_hw_ts <- forecast(model_hw_ts, h = 24)

Performance

accuracy(forecast_hw_ts, test_sales)
#>                       ME      RMSE       MAE         MPE      MAPE      MASE
#> Training set  0.02610279  3.938768  3.041723  0.06028718  7.062391 0.3847238
#> Test set     12.72533372 15.500735 12.725334 28.13742499 28.137425 1.6095284
#>                    ACF1 Theil's U
#> Training set 0.07179727        NA
#> Test set     0.81180612  2.251022
test_forecast(actual = sales_ts, forecast.obj = forecast_hw_ts, train = train_sales, test = test_sales)

STLF

Modeling

model_stlf_ts <- stlf(train_sales, h = 24, s.window = 12)

Forecast

forecast_stlf_ts <- forecast(model_stlf_ts, h = 24)

Performance

accuracy(forecast_stlf_ts, test_sales)
#>                       ME     RMSE      MAE        MPE      MAPE      MASE
#> Training set -0.04259894 3.140330 2.504655 -0.3657999  5.618128 0.3167943
#> Test set      3.45480471 6.451794 5.770927  5.7140000 13.377837 0.7299196
#>                     ACF1 Theil's U
#> Training set -0.05251589        NA
#> Test set      0.75127609  1.011569
test_forecast(actual = sales_ts, forecast.obj = forecast_stlf_ts, train = train_sales, test = test_sales)

TBATS

Modeling

model_tbats_ts <- train_sales %>% tbats(use.box.cox = TRUE,
                                        use.trend = TRUE,
                                        use.damped.trend = TRUE)
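
Printing the fitted object shows which components TBATS actually kept (the Box-Cox lambda, whether the damped trend survived, and the seasonal terms). A quick look, as a sketch added here (tbats.components() is assumed to be available from the forecast package):

model_tbats_ts                          # prints the selected specification
plot(tbats.components(model_tbats_ts))  # level / slope / seasonal components over time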

Forecast

forecast_tbats_ts <- forecast(model_tbats_ts, h = 24)

Performance

accuracy(forecast_tbats_ts, test_sales)
#>                      ME     RMSE      MAE       MPE      MAPE     MASE
#> Training set -0.2052848 3.249153 2.510653 -0.683961  5.722421 0.317553
#> Test set      7.9117290 9.730246 8.241959 16.250336 17.545153 1.042461
#>                    ACF1 Theil's U
#> Training set 0.01523089        NA
#> Test set     0.76294995  1.380336
test_forecast(actual = sales_ts, forecast.obj = forecast_tbats_ts, train = train_sales, test = test_sales)

From the analysis above, we can conclude that we have successfully forecast the test period and that STLF is the best model, with the lowest test-set error (MAE 5.77).
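
To make the comparison explicit, here is a small sketch (assuming the four forecast objects above are still in memory) that collects the test-set rows of each accuracy() call into one table:

rbind(
  ARIMA          = accuracy(forecast_arima_ts, test_sales)["Test set", ],
  "Holt-Winters" = accuracy(forecast_hw_ts,    test_sales)["Test set", ],
  STLF           = accuracy(forecast_stlf_ts,  test_sales)["Test set", ],
  TBATS          = accuracy(forecast_tbats_ts, test_sales)["Test set", ]
)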

Assumption Test

Auto-Correlation

Box.test(model_stlf_ts$residuals, type = "Ljung-Box", lag = 24)
#> 
#>  Box-Ljung test
#> 
#> data:  model_stlf_ts$residuals
#> X-squared = 26.368, df = 24, p-value = 0.3348

Normality

shapiro.test(x = model_stlf_ts$residuals)
#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  model_stlf_ts$residuals
#> W = 0.98731, p-value = 0.4023
hist(model_stlf_ts$residuals, breaks = 20)
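
As a complementary check (a sketch, not part of the original workflow), forecast::checkresiduals() bundles a residual time plot, an ACF plot, a histogram, and the Ljung-Box test in a single call:

checkresiduals(model_stlf_ts)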

Conclusion

Of all the models, the STLF model is the best, with a test-set MAE of 5.77 and a MAPE of 13.38.

Based on the assumption checks, there is no significant autocorrelation in the forecast residuals (Ljung-Box p-value > 0.05), and the residuals are normally distributed (Shapiro-Wilk p-value > 0.05), as the histogram above also suggests.