Background

This data tell us about sales of Monthly gasoline demand Ontario gallon millions 1960 – 1975, and will compare some model also evaluation model. The error numbers will be right model can be used to predicting model time series forecasting. In this modelling, we will use Triple Exponential Smooting/Holt Winters Exponential Smoothing and SARIMA model

# install.packages("devtools")
# devtools::install_github("FinYang/tsdl")
# attributes(tsdl[[4]])

Import library

library(tidyverse) 
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'purrr' was built under R version 4.0.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate) # date manipulation
## Warning: package 'lubridate' was built under R version 4.0.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(forecast) # time series library
## Warning: package 'forecast' was built under R version 4.0.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(TTR) # for Simple moving average function
## Warning: package 'TTR' was built under R version 4.0.2
library(MLmetrics) # calculate error
## Warning: package 'MLmetrics' was built under R version 4.0.2
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
library(tseries) # adf.test
## Warning: package 'tseries' was built under R version 4.0.2
library(fpp) # usconsumtion
## Warning: package 'fpp' was built under R version 4.0.2
## Loading required package: fma
## Warning: package 'fma' was built under R version 4.0.2
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 4.0.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(TSstudio) # mempercantik visualisasi timeseries
## Warning: package 'TSstudio' was built under R version 4.0.2

Data Pre processing

library(tsdl)
library(forecast)
tsdl
## Time Series Data Library: 648 time series  
## 
##                        Frequency
## Subject                 0.1 0.25   1   4   5   6  12  13  52 365 Total
##   Agriculture             0    0  37   0   0   0   3   0   0   0    40
##   Chemistry               0    0   8   0   0   0   0   0   0   0     8
##   Computing               0    0   6   0   0   0   0   0   0   0     6
##   Crime                   0    0   1   0   0   0   2   1   0   0     4
##   Demography              1    0   9   2   0   0   3   0   0   2    17
##   Ecology                 0    0  23   0   0   0   0   0   0   0    23
##   Finance                 0    0  23   5   0   0  20   0   2   1    51
##   Health                  0    0   8   0   0   0   6   0   1   0    15
##   Hydrology               0    0  42   0   0   0  78   1   0   6   127
##   Industry                0    0   9   0   0   0   2   0   1   0    12
##   Labour market           0    0   3   4   0   0  17   0   0   0    24
##   Macroeconomic           0    0  18  33   0   0   5   0   0   0    56
##   Meteorology             0    0  18   0   0   0  17   0   0  12    47
##   Microeconomic           0    0  27   1   0   0   7   0   1   0    36
##   Miscellaneous           0    0   4   0   1   1   3   0   1   0    10
##   Physics                 0    0  12   0   0   0   4   0   0   0    16
##   Production              0    0   4  14   0   0  28   1   1   0    48
##   Sales                   0    0  10   3   0   0  24   0   9   0    46
##   Sport                   0    1   1   0   0   0   0   0   0   0     2
##   Transport and tourism   0    0   1   1   0   0  12   0   0   0    14
##   Tree-rings              0    0  34   0   0   0   1   0   0   0    35
##   Utilities               0    0   2   1   0   0   8   0   0   0    11
##   Total                   1    1 300  64   1   1 240   3  16  21   648

Import and plotting data from tsdl

sales <-  tsdl[[4]] # import sales data from tsdl
autoplot(sales) #plotting time series sales data

Check missing data

anyNA(sales)
## [1] FALSE

-> False, it means data is complete and no missing data

Decomposition

sales_dc <- sales %>%
  decompose(type = "multiplicative") %>% 
  autoplot()
sales_dc

Sales data are type of multiplicative and has trend and seasonality

1. Using Exponential Smoothing Model

Cross validation

#20% untuk data test
test_sales <-  tail(sales, 42)

#80% untuk data train 
train_sales <-  head(sales, length(sales)-42)

Fitting Model Triple Exponential Smooting/Holt Winters Exponential Smoothing

Based on decompose result, sales data has a trend and seasonality. for the next step, will using Triple Exponential Smooting/Holt Winters Exponential Smoothing

sales_ses <-  HoltWinters(train_sales, seasonal = "multiplicative", )
sales_ses
## Holt-Winters exponential smoothing with trend and multiplicative seasonal component.
## 
## Call:
## HoltWinters(x = train_sales, seasonal = "multiplicative")
## 
## Smoothing parameters:
##  alpha: 0.01480706
##  beta : 0.2034893
##  gamma: 0.3541218
## 
## Coefficients:
##             [,1]
## a   1.916298e+05
## b   8.376888e+02
## s1  1.185041e+00
## s2  1.151593e+00
## s3  1.061319e+00
## s4  1.056244e+00
## s5  9.994617e-01
## s6  1.019175e+00
## s7  8.875402e-01
## s8  8.736523e-01
## s9  9.472530e-01
## s10 9.455824e-01
## s11 1.069419e+00
## s12 1.095952e+00

Forecasting

sales_forecast <-  forecast(object = sales_ses, h =36)

visualisasi actual vs forecasting

sales %>% 
  autoplot(series = "actual") +
  autolayer(sales_forecast$fitted, series = "train") +
  autolayer(sales_forecast$mean, series = "test") +
  theme_minimal()
## Warning: Removed 12 row(s) containing missing values (geom_path).

## Evaluation Model

eval_ses <- accuracy(sales_forecast, test_sales) 
eval_ses
##                     ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  892.4036  5559.481 4202.427 0.6126137 2.791996 0.5465397
## Test set     3778.5320 10318.195 8266.382 1.6463648 3.797144 1.0750707
##                    ACF1 Theil's U
## Training set -0.2021575        NA
## Test set     -0.1576150 0.5358628

result from error of SES model. we get the error 3.7%

2. Using SARIMA Model

Stationary check

library(tseries)
adf.test(sales)
## Warning in adf.test(sales): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales
## Dickey-Fuller = -10.096, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

Data is not stationary, because p-value > 0.05

Data need to differencing, because any seasonality and trend

sales_diff <- sales %>% 
  diff(lag=12) %>% 
  diff(lag=1) 

sales_diff %>% 
  autoplot()

## fitting model auto

sales_auto <- auto.arima(y = train_sales, seasonal = T)
summary(sales_auto)
## Series: train_sales 
## ARIMA(3,1,1)(0,1,1)[12] 
## 
## Coefficients:
##           ar1      ar2     ar3      ma1     sma1
##       -0.3517  -0.1234  0.4087  -0.8320  -0.7736
## s.e.   0.1243   0.1352  0.1160   0.0878   0.0957
## 
## sigma^2 estimated as 19522851:  log likelihood=-1348.41
## AIC=2708.81   AICc=2709.46   BIC=2726.33
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 682.6604 4144.889 3074.124 0.342796 2.014285 0.3998001 -0.02807681

Visualization act vs auto SARIMA

train_sales %>% 
  autoplot(series = "actual") +
  autolayer(sales_auto$fitted, series = "SARIMA auto") +
  theme_minimal()

### Fitting SARIMA model manually

tsdisplay(sales_diff) # yang dicek adalah data yang sudah didifferencing

## setting the parameters of SARIMA based on ACF and PACF graph in above

sales_sarima1 <- Arima(y = train_sales, order = c(1,1,1),
                       seasonal = list(order = c(0,1,2), period = 12))

sales_sarima2 <- Arima(y = train_sales, order = c(1,1,2), 
                       seasonal = list(order = c(0,1,2), period = 12))

sales_sarima3 <- Arima(y = train_sales, order = c(2,1,1), 
                       seasonal = list(order = c(0,1,2), period = 12))

sales_sarima4 <- Arima(y = train_sales, order = c(2,1,2), 
                       seasonal = list(order = c(0,1,2), period = 12))

Summarize error from all SARIMA model

summary(sales_auto) 
## Series: train_sales 
## ARIMA(3,1,1)(0,1,1)[12] 
## 
## Coefficients:
##           ar1      ar2     ar3      ma1     sma1
##       -0.3517  -0.1234  0.4087  -0.8320  -0.7736
## s.e.   0.1243   0.1352  0.1160   0.0878   0.0957
## 
## sigma^2 estimated as 19522851:  log likelihood=-1348.41
## AIC=2708.81   AICc=2709.46   BIC=2726.33
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 682.6604 4144.889 3074.124 0.342796 2.014285 0.3998001 -0.02807681
summary(sales_sarima1)
## Series: train_sales 
## ARIMA(1,1,1)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ma1     sma1     sma2
##       -0.3080  -0.8172  -0.6716  -0.3284
## s.e.   0.0903   0.0529   0.2479   0.1098
## 
## sigma^2 estimated as 21116825:  log likelihood=-1361.16
## AIC=2732.31   AICc=2732.77   BIC=2746.91
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE        ACF1
## Training set 698.8819 4327.076 3221.616 0.3375867 2.120399 0.418982 -0.09897742
summary(sales_sarima2)
## Series: train_sales 
## ARIMA(1,1,2)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ma1     ma2     sma1     sma2
##       -0.0601  -1.1251  0.3094  -0.7184  -0.2814
## s.e.   0.1861   0.1712  0.1526   0.2880   0.1132
## 
## sigma^2 estimated as 20651517:  log likelihood=-1359.28
## AIC=2730.56   AICc=2731.21   BIC=2748.08
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 616.2677 4263.019 3146.165 0.2846527 2.072302 0.4091692 -0.0320436
summary(sales_sarima3)
## Series: train_sales 
## ARIMA(2,1,1)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2      ma1     sma1     sma2
##       -0.8613  -0.6412  -0.2655  -0.7753  -0.1002
## s.e.   0.1101   0.0947   0.1603   0.1272   0.1018
## 
## sigma^2 estimated as 19554679:  log likelihood=-1350.7
## AIC=2713.39   AICc=2714.04   BIC=2730.91
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 444.2552 4148.267 3045.339 0.1894393 1.995946 0.3960566 0.00254229
summary(sales_sarima4)
## Series: train_sales 
## ARIMA(2,1,2)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2      ma1      ma2     sma1     sma2
##       -0.9118  -0.5992  -0.2531  -0.2028  -0.7570  -0.1143
## s.e.   0.1061   0.0910   0.1262   0.1301   0.1265   0.1019
## 
## sigma^2 estimated as 19410090:  log likelihood=-1349.58
## AIC=2713.16   AICc=2714.03   BIC=2733.6
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 553.8924 4117.217 3011.831 0.2550545 1.973529 0.3916987
##                     ACF1
## Training set 0.001816106

the result from modelling SARIMA auto and manual:

SARIMA(3,1,1)(0,1,1)[12] -> auto 2.01% SARIMA(1,1,1)(0,1,2)[12] -> model sales_sarima_1 2.12% SARIMA(1,1,2)(0,1,2)[12] -> model sales_sarima_2 2.07% SARIMA(2,1,1)(0,1,2)[12] -> model sales_sarima_3 1.99% SARIMA(2,1,2)(0,1,2)[12] -> model sales_sarima_4 1.97%

Summary : sales_sarima4 are the best model compared with other models, if we see the MAPE error from training dataset

Forecasting

sales_forecast <- forecast(sales_sarima4, h = length(test_sales))

Evaluate model using data test

accuracy(sales_forecast, test_sales)
##                     ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  553.8924  4117.217 3011.831 0.2550545 1.973529 0.3916987
## Test set     6737.7634 11174.302 8843.631 2.8675232 3.918884 1.1501438
##                      ACF1 Theil's U
## Training set  0.001816106        NA
## Test set     -0.053297640 0.6113557

if data test imlemented into SARIMA model, the error still acceptable under 5%. we still get 3.9% from this model.

Visualize actual data and forecasting

library(plotly)
## Warning: package 'plotly' was built under R version 4.0.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
test_forecast(actual = sales, 
              forecast.obj = sales_forecast, 
              train = train_sales, 
              test= test_sales)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Assumption

# menggunakan visualisasi
acf(sales_sarima4$residuals)

# menggunakan Ljung-Box test
Box.test(x = sales_sarima4$residuals, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  sales_sarima4$residuals
## X-squared = 0.0005047, df = 1, p-value = 0.9821

Summary : For sales_sarima4 model, it was no autocorrelation for each predictor.

Normality of residuals

shapiro.test(sales_sarima4$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  sales_sarima4$residuals
## W = 0.97903, p-value = 0.02158

Summary : Residuals are has not normal distribution. which means prediction result has big error if compared with others data.