#install.packages("devtools")
#devtools::install_github("FinYang/tsdl")
library(tsdl)
library(forecast)
library(expsmooth)
library(ggplot2)
library(seasonal)
library(rugarch)
The data for this assignment is Australian Monthly Confectionery Production from 1957-1995, from the tsdl() package.
conf<- tsdl[[117]]
df<- window(conf, start = c(1979,1))
We set aside 24 months of data for testing, and use a subset of data starting from January 1979 for the training set.
length_test_set = 24
h = 12
data = ts(df,start = c(1979,1),frequency = 12)
train_data = ts(df[1:(length(df)-length_test_set)], start = c(1979,1),frequency = h)
autoplot(train_data) +
ggtitle("Production of Confectionery") +
xlab("Year") + ylab("Production") + theme_bw()
The data has non-linear trend, cycles, clear seasonality. Seasonality appears to be variable with at least 3 peaks during the year. There is a significant decline in January.
ggseasonplot(train_data, year.labels=TRUE, year.labels.left=TRUE) + ylab("Production") +
ggtitle("Seasonal plot: Monthly Chocolate Production") + theme_bw()
ggseasonplot(train_data, polar=TRUE) + ylab("Production") +
ggtitle("Polar seasonal plot: Monthly Chocolate Production") + theme_bw()
The seasonal pattern has been changing since 1988, with August now being one of the peak production months. The mean is quite variable across seasons.
ggsubseriesplot(train_data) + ylab("Sales") +
ggtitle("Seasonal subseries plot: Monthly Chocolate Production") + theme_bw()
## Autocorrelation
There is significant autocorrelation, especially at lags 12 and 24.
Acf(train_data)
pacf(train_data)
## Decomposition
fit = stl(train_data,t.window=13, s.window="periodic", robust=TRUE)
autoplot(fit)
# Seasonally adjusted time series
autoplot(train_data, series="Data") +
autolayer(trendcycle(fit), series="Trend") +
autolayer(seasadj(fit), series="Seasonally Adjusted") +
xlab("Year") + ylab("Tonnes") +
ggtitle("Monthly Chocolate Production") + theme_bw() +
scale_colour_manual(values=c("gray","blue","red"),breaks=c("Data","Seasonally Adjusted","Trend"))
To evaluate our models, we use time series cross-validation, using a rolling 7 years as the validation period. The function tsCV() works for any forecasting function that returns an object of class forecast.
models_mae = NULL
initial_obs = 84
model_name = "Naïve"
model = naive
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Naïve : 1382.1"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "Seasonal Naïve"
model = snaive
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Seasonal Naïve : 768.3"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "RW+Decomposition"
model = function(y, h){
stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
return(forecast(stl_decomposition,method = "naive",h=h))
}
# perform decomposition
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model RW+Decomposition : 843.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "Exp Smoothing"
model = function(y, h){
fitted_model = ets(y)
return(forecast(fitted_model,h=h))
}
# perform decomposition
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Exp Smoothing : 636.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "Exp Smoothing+Decomposition"
model = function(y, h){
stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
return(forecast(stl_decomposition,method = "ets",h=h))
}
# perform decomposition
my_forecast_plot(model,model_name)
autoplot(model(train_data,24))
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model Exp Smoothing+Decomposition : 656.9"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
print(auto.arima(train_data,stepwise = FALSE,approximation = FALSE))
## Series: train_data
## ARIMA(1,0,1)(1,1,1)[12] with drift
##
## Coefficients:
## ar1 ma1 sar1 sma1 drift
## 0.8876 -0.7453 -0.2960 -0.4393 25.1623
## s.e. 0.0814 0.1152 0.1265 0.1297 3.9930
##
## sigma^2 estimated as 367762: log likelihood=-1284.26
## AIC=2580.51 AICc=2581.05 BIC=2599.11
model_name = "SARIMA"
model = function(y, h){forecast(Arima(y, order=c(3,1,0), seasonal = c(0,1,1)), h=h)}
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model SARIMA : 621"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
stl_decomposition = stl(train_data, t.window=13, s.window="periodic",robust=TRUE)
print(auto.arima(seasadj(stl_decomposition),stepwise = FALSE,approximation = FALSE))
## Series: seasadj(stl_decomposition)
## ARIMA(0,1,2)(0,0,2)[12] with drift
##
## Coefficients:
## ma1 ma2 sma1 sma2 drift
## -0.7608 -0.1641 0.0532 0.3327 25.4675
## s.e. 0.0754 0.0817 0.0766 0.0860 4.6983
##
## sigma^2 estimated as 327630: log likelihood=-1359.23
## AIC=2730.47 AICc=2730.97 BIC=2749.46
model_name = "ARIMA + Decomposition"
model = function(y, h){
fit = stlm(y,s.window="periodic", robust=TRUE,modelfunction=Arima, order=c(3,1,1))
return(forecast(fit,h=h))
}
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model ARIMA + Decomposition : 692"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "GARCH + Decomposition"
model = function(y, h){
# decomposition
stl_decomposition = stl(y, t.window=13, s.window="periodic",robust=TRUE)
# garch on y_adjusted
y_adjusted = seasadj(stl_decomposition)
diff_y_adjusted = diff(y_adjusted)
ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1,1)),
mean.model = list(armaOrder = c(3, 1))) %>%
ugarchfit(diff_y_adjusted, solver = 'hybrid',numderiv.control = list(hess.zero.tol=1e-7)) %>%
ugarchforecast(n.ahead = h) %>%
fitted %>%
ts(frequency = h,start = next_time_step(y)) ->output_garch
# add last observation:
output_garch = output_garch + y_adjusted[length(y_adjusted)]
# get last year seasonality
tmp = end(y)
tmp[1] = tmp[1]-1
new_start = next_time_step(tmp,is_end = T)
last_year_seasonality = window(seasonal(stl_decomposition),start = new_start)
# sum them up
fcast = output_garch+ts(last_year_seasonality,frequency = h,start = start(output_garch))
# make it a forecast object
fcast = structure(list(mean = fcast), class='forecast')
return(fcast)
}
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model GARCH + Decomposition : 832.2"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "NNETAR"
model = function(y, h){forecast(nnetar(y,p = 5, P = 2), h=h)}
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model NNETAR : 671.7"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
model_name = "NNETAR + Decomposition"
model = function(y, h){
fit = stlm(y,s.window="periodic", robust=TRUE,modelfunction=nnetar, p=2,P=0)
return(forecast(fit,h=h))
}
my_forecast_plot(model,model_name)
# compute CV MAE
cv_residuals = tsCV(train_data,model,initial = initial_obs,h=h)
mae = colMeans(abs(cv_residuals), na.rm = T)
print(paste("Average MAE across all the horizons for model",model_name,":",round(mean(mae),1)))
## [1] "Average MAE across all the horizons for model NNETAR + Decomposition : 794.3"
# store MAE
models_mae = rbind(models_mae,data.frame(model = model_name, mae,horizon = 1:h,row.names = NULL))
print_scores(models_mae)
## [1] "Naïve : 1382.06"
## [1] "Seasonal Naïve : 768.27"
## [1] "RW+Decomposition : 843.21"
## [1] "Exp Smoothing : 636.23"
## [1] "Exp Smoothing+Decomposition : 656.87"
## [1] "SARIMA : 621.03"
## [1] "ARIMA + Decomposition : 692.03"
## [1] "GARCH + Decomposition : 832.19"
## [1] "NNETAR : 671.65"
## [1] "NNETAR + Decomposition : 794.35"
The SARIMA gives the lowest Average MAE across all forecast horizons.
get_best_models(models_mae)
## model mae horizon
## 1 SARIMA 553.7887 1
## 2 SARIMA 559.0715 2
## 3 SARIMA 576.2091 3
## 4 SARIMA 572.8799 4
## 5 SARIMA 626.1862 5
## 6 SARIMA 652.5380 6
## 7 Exp Smoothing 670.4610 7
## 8 Exp Smoothing 660.2437 8
## 9 Exp Smoothing 634.9832 9
## 10 Exp Smoothing 604.6789 10
## 11 Exp Smoothing 588.1677 11
## 12 Exp Smoothing+Decomposition 622.5928 12
After forecast horizon 5, Exponential Smoothing with Decomposition performs consistently better in terms of MAE.
plot_scores(models_mae[models_mae$model %in% c("SARIMA","NNETAR","Exp Smoothing","Exp Smoothing+Decomposition"),])
mae_test = NULL
# fit the best model on training data
fitted_ets = ets(train_data, model = "ZZZ")
cbind('Residuals' = residuals(fitted_ets),
'Forecast errors' = residuals(fitted_ets,type='response')) %>%
autoplot(facet=TRUE) + xlab("Year") + ylab("")
library(magrittr)
fitted_ets %>% forecast(h = h) %>%
autoplot() + ylab("Production in Tonnes")