PART A

Loading packages

library(fpp2)
library(knitr)

Loading dataset

SUGAR = read.csv("C:/ECON 4210/as2_data.csv")
df = SUGAR
df = ts(df, start=1990, frequency=12)
y = df[,"SUGAR"]

Understanding the Data

y %>%
  stl(t.window=13, s.window="periodic", robust=TRUE) %>%
  autoplot()

Prior to evaluating the different forecasting models, it is necessary to examine the properties of the data. An STL decomposition (Seasonal and Trend decomposition using Loess) gives a clearer view of the trend, seasonality, and remainder components. The data appear to be non-stationary, as the values tend to increase over the years, apart from the drop during the 2008-2009 financial crisis. The data also show strong seasonality. The remainder component stays relatively stable, with a few outliers such as the 2008-2009 financial crisis.
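As a quick cross-check of the seasonality and trend visible in the STL decomposition, a seasonal plot and the autocorrelation function can also be inspected. This is a supplementary sketch using functions from the forecast package (loaded via fpp2); its output is not part of the original report.

# Seasonal plot: each year overlaid to show the within-year pattern
ggseasonplot(y, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("SUGAR") +
  ggtitle("Seasonal plot: monthly sugar production")

# ACF: slowly decaying autocorrelations suggest a trend; spikes near lag 12 suggest seasonality
ggAcf(y, lag.max=36)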

Training data & Test data

We will use data to the end of 2014 for training and data from January 2015 to August 2019 for testing.

train <- window(y,start=c(1990, 1),end=c(2014, 12))

test <- window(y,start=2015)
both <- window(y,start=c(1990, 1))
h=length(test)

TSLM and STL Methods

tps = tslm(train ~ trend + season)
fit1 = forecast(tps, h=h)
p1 = autoplot(fit1) + ylab("SUGAR")

STL <- stl(train, t.window=15, s.window="periodic", robust=TRUE)
fit2 <- forecast(STL, method="rwdrift",h=h)
p2 = autoplot(fit2)

ETS method

ETS <- ets(train, model="ZZZ") 
fit3 <- forecast(ETS, h=h)
p3 = autoplot(fit3) + ylab("SUGAR")

ARIMA method

ndiffs(y)
## [1] 0

No first differences are required for the dataset; auto.arima selects the order of differencing automatically, so this will be respected when the model is chosen.

# Auto Arima
fit4 = forecast(auto.arima(train), h=h)
p4 = autoplot(fit4) + ylab("SUGAR")
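To see which specification auto.arima actually selected, including whether any seasonal differencing was applied, the fitted model and its residuals can be inspected. This is an optional check; its output is not reproduced here.

# Inspect the chosen ARIMA specification (orders, seasonal terms, coefficients)
summary(fit4$model)

# Residual diagnostics, including the residual ACF and the Ljung-Box test
checkresiduals(fit4)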

Random Walk with Drift

fit5 = rwf(train, h=h, drift = TRUE)
p5 = autoplot(fit5) + ylab("SUGAR")

Graphing the Models

TSLM, STL, and ETS Method Forecasts

gridExtra::grid.arrange(p1, p2, p3, nrow=3)

ARIMA and Random Walk Method Forecasts

gridExtra::grid.arrange(p4, p5, nrow=2)

All of the models except the random walk with drift capture the seasonality in the data; the random walk with drift picks up the trend but ignores the seasonal component. The other four models capture the seasonality well but track the trend less closely. Furthermore, ETS has wider prediction intervals than TSLM, STL, and ARIMA.
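To make the visual comparison against the held-out data more direct, the point forecasts can be overlaid on the actual series. This is a supplementary plot sketch, not part of the original output; the start year of 2010 is chosen only to zoom in on the test period.

# Overlay each model's point forecasts on the actual series from 2010 onward
autoplot(window(y, start=2010)) +
  autolayer(fit1, series="TSLM", PI=FALSE) +
  autolayer(fit2, series="STL + rwdrift", PI=FALSE) +
  autolayer(fit3, series="ETS", PI=FALSE) +
  autolayer(fit4, series="ARIMA", PI=FALSE) +
  autolayer(fit5, series="RW drift", PI=FALSE) +
  ylab("SUGAR") +
  guides(colour=guide_legend(title="Forecast"))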

Accuracy measures

a1 = accuracy(fit1, test)
a2 = accuracy(fit2, test)
a3 = accuracy(fit3, test)
a4 = accuracy(fit4, test)
a5 = accuracy(fit5, test)

a.table<-rbind(a1, a2, a3, a4, a5)
row.names(a.table) = c('TSLM training','TSLM test','STL training','STL test','ETS training','ETS test','ARIMA training','ARIMA test','RW training','RW test')

a.table<-as.data.frame(a.table)
a.table<-a.table[order(a.table$MASE),]
a.table
##                           ME      RMSE       MAE          MPE      MAPE
## ARIMA training  4.455841e-02  3.521420  2.617688  -0.03631179  2.413840
## ETS training    1.063591e-02  3.611275  2.722486  -0.05983491  2.510413
## STL training   -1.520768e-15  3.747449  2.830674  -0.04703100  2.610017
## RW training     3.803327e-16  7.776956  5.776888  -0.24697590  5.278488
## STL test        5.535326e+00  7.838843  6.593663   4.90642090  5.801502
## TSLM training   1.467255e-15  8.181900  6.969203  -0.60118175  6.544394
## ETS test        6.290508e+00  8.486246  7.052546   5.55210633  6.209171
## ARIMA test      7.709842e+00  9.769035  8.203006   6.81318414  7.242044
## TSLM test       9.619135e+00 11.327962  9.689007   8.48741430  8.544645
## RW test        -1.495833e+01 16.938159 14.958335 -13.70688327 13.706883
##                     MASE         ACF1 Theil's U
## ARIMA training 0.5156512  0.000283213        NA
## ETS training   0.5362951  0.038634101        NA
## STL training   0.5576066 -0.231769996        NA
## RW training    1.1379732  0.257288876        NA
## STL test       1.2988675  0.651999435  1.359940
## TSLM training  1.3728441  0.891666568        NA
## ETS test       1.3892615  0.673704404  1.474194
## ARIMA test     1.6158874  0.711762211  1.708774
## TSLM test      1.9086107  0.698444987  1.970136
## RW test        2.9466009  0.734942375  3.062910

For the purposes of this question, ETS performed well on the test set, with a lower MAPE and MASE than most of the other models. However, STL performed even better: the results show that STL has a lower MAPE and MASE than ETS, while ARIMA was close behind ETS. TSLM and the random walk with drift performed worst out of sample.
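Since knitr is already loaded, the sorted accuracy table can also be rendered more readably with kable. This is an optional presentation step; the values are the same as in the console output above.

# Render the sorted accuracy table with three decimal places
kable(a.table, digits=3, caption="Accuracy measures sorted by MASE")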

Actual and Forecasted Values

# Re-estimate on the full sample (through August 2019) for the final forecast
train <- window(y, start=c(1990, 1), end=c(2019, 8))
h = 13   # September 2019 through September 2020

ETS <- ets(train, model="ZZZ") 
fit2 <- forecast(ETS, h=13)

autoplot(y, series = "Actual") +
ggtitle("Total Sugar production") +
  autolayer(fit2, PI = FALSE, series = "ETS Forecast") +
  ylab("Monthly Sugar Production (Million Metric Tons)")+
  theme(plot.title = element_text(size=15, face = "bold",hjust = 0.5) ,
        axis.text.x = element_text(size=10),
        axis.text.y = element_text(size=10))

The forecasted values for September 2019 to September 2020 continue the trend and seasonal pattern of the actual series closely and appear reasonable for prediction purposes.
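For reference, the monthly point forecasts and their prediction intervals can also be tabulated. This is an optional step; the values are not reproduced here.

# Point forecasts with 80% and 95% prediction intervals for the 13-month horizon
kable(as.data.frame(fit2), digits=2)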

PART B

Introducing the Functions

# ETS  method
ets_function <- function(x, h) {
  ETS <- ets(x, model="ZZZ") 
  forecast(ETS, h=h)
}
# STL Method
stl_function <- function(x, h) {
  STL <- stl(x, t.window=15, s.window="periodic", robust=TRUE)
  forecast(STL, method="rwdrift",h=h)
}
# trend plus seasonal
tslm_function <- function(x, h) {
  tps <- tslm(x ~ trend + season)
  forecast(tps,h=h)
}
## arima
arima_function <- function(x, h) {
  forecast(auto.arima(x), h=h)
}

Time Series Cross-Validation

The data are windowed to the first 60 observations, which runs through the end of 1994, to keep the cross-validation computationally manageable.

window_data <- window(y, end=c(1994,12))

# Calling functions and generating tsCV 
e1 <- tsCV(window_data, ets_function, h=6)
e2 <- tsCV(window_data, stl_function, h=6)
e3 <- tsCV(window_data, tslm_function, h=6)
e4 <- tsCV(window_data, arima_function, h=6)

MSE of each model class

mse1 = mean(e1^2, na.rm=TRUE)
mse2 = mean(e2^2, na.rm=TRUE)
mse3 = mean(e3^2, na.rm=TRUE)
mse4 = mean(e4^2, na.rm=TRUE)

MSE table

a.table<-rbind(mse1, mse2, mse3, mse4)
row.names(a.table)<-c('ETS MSE', 'STL MSE', 'TSLM MSE', 'ARIMA MSE') 
a.table<-as.data.frame(a.table)
a.table
##                  V1
## ETS MSE   207.09058
## STL MSE    14.65243
## TSLM MSE   55.95463
## ARIMA MSE 319.42737

MSE against Forecast Horizon

To produce the cross-validated MSE at each forecast horizon, we use a nested loop.

models = c("ets_function","stl_function","tslm_function",
           "arima_function")

# Nested loop (loops are relatively slow in R, so this takes a while):
# for each model class, compute the cross-validated MSE at horizons 1 to 6
for (mdl in models) {
  mse = c() # empty vector to store the MSE for each forecast horizon
  for (i in 1:6) {
    e <- tsCV(window_data, get(mdl), h=i)
    mse[i] = mean(e^2, na.rm=TRUE)
  }
  plot(mse, ann=FALSE)
  title(main=mdl, col.main="red", font.main=4)
  title(xlab="h")
  title(ylab="MSE")
}
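The same horizon-by-horizon comparison can be collected into a single matrix and plotted on one set of axes, which makes the relative ranking of the four model classes easier to see. This is a supplementary sketch; it recomputes the cross-validation errors, so it is not fast.

# Collect the MSE by horizon for all four model classes and plot them together
mse_all <- sapply(models, function(mdl) {
  sapply(1:6, function(i) {
    e <- tsCV(window_data, get(mdl), h=i)
    mean(e^2, na.rm=TRUE)
  })
})
matplot(1:6, mse_all, type="b", pch=1:4, lty=1,
        xlab="h", ylab="MSE", main="Cross-validated MSE by forecast horizon")
legend("topleft", legend=colnames(mse_all), col=1:4, pch=1:4, lty=1)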

Detailed breakdown

Overall, the STL model performed the best, with the smallest cross-validated MSE. For each model class, the MSE generally increases as the forecast horizon lengthens.