#install.packages("devtools")
#devtools::install_github("FinYang/tsdl")

library(tsdl)
library(forecast)
library(expsmooth)
library(ggplot2)
library(seasonal)
library(rugarch)

Getting Data

The data for this assignment is Australian Monthly Confectionery Production from 1957-1995, from the tsdl() package.

conf<- tsdl[[117]]

df<- window(conf, start = c(1979,1))

Creating Training and Test Set

We set aside 24 months of data for testing, and use a subset of data starting from January 1979 for the training set.

length_test_set = 24
h = 12

data = ts(df,start = c(1979,1),frequency = 12)
train_data = ts(df[1:(length(df)-length_test_set)], start = c(1979,1),frequency = h)

EXPLORATION

autoplot(train_data) +
  ggtitle("Production of Confectionery") + 
  xlab("Year") + ylab("Production") + theme_bw()

The data has non-linear trend, cycles, clear seasonality. Seasonality appears to be variable with at least 3 peaks during the year. There is a significant decline in January.

ggseasonplot(train_data, year.labels=TRUE, year.labels.left=TRUE) + ylab("Production") +
  ggtitle("Seasonal plot: Monthly Chocolate Production") + theme_bw()

ggseasonplot(train_data, polar=TRUE) + ylab("Production") +
  ggtitle("Polar seasonal plot: Monthly Chocolate Production") + theme_bw()

The seasonal pattern has been changing since 1988, with August now being one of the peak production months. The mean is quite variable across seasons.

ggsubseriesplot(train_data)  + ylab("Sales") +
  ggtitle("Seasonal subseries plot: Monthly Chocolate Production") + theme_bw()

## Autocorrelation

There is significant autocorrelation, especially at lags 12 and 24.

Acf(train_data)

pacf(train_data)

## Decomposition

fit = stl(train_data,t.window=13, s.window="periodic", robust=TRUE)
autoplot(fit)

# Seasonally adjusted time series
autoplot(train_data, series="Data") + 
  autolayer(trendcycle(fit), series="Trend") + 
  autolayer(seasadj(fit), series="Seasonally Adjusted") +
  xlab("Year") + ylab("Tonnes") +
  ggtitle("Monthly Chocolate Production") + theme_bw() +
  scale_colour_manual(values=c("gray","blue","red"),breaks=c("Data","Seasonally Adjusted","Trend"))

Some utility functions

MODELS VALIDATION

To evaluate our models, we use time series cross-validation, using a rolling 7 years as the validation period. The function tsCV() works for any forecasting function that returns an object of class forecast.

models_mae = NULL
initial_obs = 84

Benchmark: Naive and Seasonal Naive

NAIVE (Benchmark)

model_name = "Naïve"
model = naive

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Naïve : 1382.1"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

SNAIVE (Benchmark)

model_name = "Seasonal Naïve"
model = snaive

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Seasonal Naïve : 768.3"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

MODELS

Random Walk + DECOMPOSITION

model_name = "RW+Decomposition"
model = function(y, h){
  stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
  return(forecast(stl_decomposition,method = "naive",h=h))
}

# perform decomposition
my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model RW+Decomposition : 843.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

Exponential smoothing

model_name = "Exp Smoothing"
model = function(y, h){
  fitted_model = ets(y)
  return(forecast(fitted_model,h=h))
}

# perform decomposition
my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Exp Smoothing : 636.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

Exponential smoothing + DECOMPOSITION

model_name = "Exp Smoothing+Decomposition"
model = function(y, h){
  stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
  return(forecast(stl_decomposition,method = "ets",h=h))
}
# perform decomposition
my_forecast_plot(model,model_name)

autoplot(model(train_data,24))

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Exp Smoothing+Decomposition : 656.9"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

SARIMA

print(auto.arima(train_data,stepwise = FALSE,approximation = FALSE))
## Series: train_data 
## ARIMA(1,0,1)(1,1,1)[12] with drift 
## 
## Coefficients:
##          ar1      ma1     sar1     sma1    drift
##       0.8876  -0.7453  -0.2960  -0.4393  25.1623
## s.e.  0.0814   0.1152   0.1265   0.1297   3.9930
## 
## sigma^2 estimated as 367762:  log likelihood=-1284.26
## AIC=2580.51   AICc=2581.05   BIC=2599.11
model_name = "SARIMA"
model = function(y, h){forecast(Arima(y, order=c(3,1,0), seasonal = c(0,1,1)), h=h)}

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model SARIMA : 621"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

ARIMA + Decomposition

stl_decomposition = stl(train_data, t.window=13, s.window="periodic",robust=TRUE)
print(auto.arima(seasadj(stl_decomposition),stepwise = FALSE,approximation = FALSE))
## Series: seasadj(stl_decomposition) 
## ARIMA(0,1,2)(0,0,2)[12] with drift 
## 
## Coefficients:
##           ma1      ma2    sma1    sma2    drift
##       -0.7608  -0.1641  0.0532  0.3327  25.4675
## s.e.   0.0754   0.0817  0.0766  0.0860   4.6983
## 
## sigma^2 estimated as 327630:  log likelihood=-1359.23
## AIC=2730.47   AICc=2730.97   BIC=2749.46
model_name = "ARIMA + Decomposition"
model = function(y, h){
  fit = stlm(y,s.window="periodic", robust=TRUE,modelfunction=Arima, order=c(3,1,1))
  return(forecast(fit,h=h))
}

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model ARIMA + Decomposition : 692"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

GARCH

model_name = "GARCH + Decomposition"
model = function(y, h){
  # decomposition
  stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
  
  # garch on y_adjusted 
  y_adjusted = seasadj(stl_decomposition)
  diff_y_adjusted = diff(y_adjusted)
  ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1,1)),
             mean.model = list(armaOrder = c(3, 1))) %>%
    ugarchfit(diff_y_adjusted, solver = 'hybrid',numderiv.control = list(hess.zero.tol=1e-7)) %>%
    ugarchforecast(n.ahead = h) %>% 
    fitted %>%
    ts(frequency = h,start = next_time_step(y)) ->output_garch
  
  # add last observation:
  output_garch = output_garch + y_adjusted[length(y_adjusted)]
  
  # get last year seasonality 
  tmp = end(y)
  tmp[1] = tmp[1]-1
  new_start = next_time_step(tmp,is_end = T)
  last_year_seasonality = window(seasonal(stl_decomposition),start = new_start)
  
  # sum them up
  fcast = output_garch+ts(last_year_seasonality,frequency = h,start = start(output_garch))
  
  # make it a forecast object
  fcast = structure(list(mean = fcast), class='forecast')
  return(fcast)
}

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model GARCH + Decomposition : 832.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

NNETAR

model_name = "NNETAR"
model = function(y, h){forecast(nnetar(y,p = 5, P = 2), h=h)}

my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model NNETAR : 671.7"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

NNETAR + Decomposition

model_name = "NNETAR + Decomposition"
model = function(y, h){
  fit = stlm(y,s.window="periodic", robust=TRUE,modelfunction=nnetar, p=2,P=0)
  return(forecast(fit,h=h))
}
my_forecast_plot(model,model_name)

# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h) 
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model NNETAR + Decomposition : 794.3"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))

Results of Cross-Validation

print_scores(models_mae)
## [1] "Naïve : 1382.06"
## [1] "Seasonal Naïve : 768.27"
## [1] "RW+Decomposition : 843.21"
## [1] "Exp Smoothing : 636.23"
## [1] "Exp Smoothing+Decomposition : 656.87"
## [1] "SARIMA : 621.03"
## [1] "ARIMA + Decomposition : 692.03"
## [1] "GARCH + Decomposition : 832.19"
## [1] "NNETAR : 671.65"
## [1] "NNETAR + Decomposition : 794.35"

The SARIMA gives the lowest Average MAE across all forecast horizons.

get_best_models(models_mae)
##                          model      mae horizon
## 1                       SARIMA 553.7887       1
## 2                       SARIMA 559.0715       2
## 3                       SARIMA 576.2091       3
## 4                       SARIMA 572.8799       4
## 5                       SARIMA 626.1862       5
## 6                       SARIMA 652.5380       6
## 7                Exp Smoothing 670.4610       7
## 8                Exp Smoothing 660.2437       8
## 9                Exp Smoothing 634.9832       9
## 10               Exp Smoothing 604.6789      10
## 11               Exp Smoothing 588.1677      11
## 12 Exp Smoothing+Decomposition 622.5928      12

After forecast horizon 5, Exponential Smoothing with Decomposition performs consistently better in terms of MAE.

plot_scores(models_mae[models_mae$model %in% c("SARIMA","NNETAR","Exp Smoothing","Exp Smoothing+Decomposition"),])

Model Evaluation

mae_test = NULL

# fit the best model on training data
fitted_ets = ets(train_data, model = "ZZZ")

cbind('Residuals' = residuals(fitted_ets),
      'Forecast errors' = residuals(fitted_ets,type='response')) %>%
  autoplot(facet=TRUE) + xlab("Year") + ylab("")

library(magrittr)

fitted_ets %>% forecast(h = h) %>%
  autoplot() + ylab("Production in Tonnes")