library(fpp2)
library(knitr)
SUGAR = read.csv("C:/ECON 4210/as2_data.csv")
df = SUGAR
df = ts(df, start=1990, frequency=12)
y = df[,"SUGAR"]
y %>%
stl(t.window=13, s.window="periodic", robust=TRUE) %>%
autoplot()
Prior to evaluating the different forecasting models, it is necessary to examin the properties of the data. By using Loess (stl), we can gain a deeper understanding of the tren, seasonality and remainder in the graph. the data appears to be non-stationary as the values tend to increase over the years, except for the financial crisis in 2008-2009. The data also shows a lot of seasonality. The remainder component stays relatively the same with a few outliers, such as the finacial crisis in 2008-2009.
We will use data to the end of 2014 for training and data from January 2015 to August 2019 for testing.
train <- window(y,start=c(1990, 1),end=c(2014, 12))
test <- window(y,start=2015)
both <- window(y,start=c(1990, 1))
h=length(test)
tps = tslm(train ~ trend + season)
fit1 = forecast(tps, h=h)
p1 = autoplot(fit1) + ylab("SUGAR")
STL <- stl(train, t.window=15, s.window="periodic", robust=TRUE)
fit2 <- forecast(STL, method="rwdrift",h=h)
p2 = autoplot(fit2)
ETS <- ets(train, model="ZZZ")
fit3 <- forecast(ETS, h=h)
p3 = autoplot(fit3) + ylab("SUGAR")
ndiffs(y)
## [1] 0
0 differences are required for the dataset. auto.arima will ensure this happen.
# Auto Arima
fit4 = forecast(auto.arima(train), h=h)
p4 = autoplot(fit4) + ylab("SUGAR")
fit5 = rwf(train, h=h, drift = TRUE)
p5 = autoplot(fit5) + ylab("SUGAR")
gridExtra::grid.arrange(p1, p2, p3, nrow=3)
gridExtra::grid.arrange(p4, p5, nrow=2)
It appears that all of the models except for random walf drift have captured the seasonality and trend in the data. Random walk drift captures the trend but ignores the seasonality component. The other 4 models capture the seasonality well but they don’t capture the trend so well. Furthermore, ETS has a larger confidence band than TSLM, STL and ARIMA.
a1 = accuracy(fit1, test)
a2 = accuracy(fit2, test)
a3 = accuracy(fit3, test)
a4 = accuracy(fit4, test)
a5 = accuracy(fit5, test)
a.table<-rbind(a1, a2, a3, a4, a5)
row.names(a.table) = c('TSLM training','TSLM test','STL training','STL test','ETS training','ETS test','ARIMA training','ARIMA test','RW training','RW test')
a.table<-as.data.frame(a.table)
a.table<-a.table[order(a.table$MASE),]
a.table
## ME RMSE MAE MPE MAPE
## ARIMA training 4.455841e-02 3.521420 2.617688 -0.03631179 2.413840
## ETS training 1.063591e-02 3.611275 2.722486 -0.05983491 2.510413
## STL training -1.520768e-15 3.747449 2.830674 -0.04703100 2.610017
## RW training 3.803327e-16 7.776956 5.776888 -0.24697590 5.278488
## STL test 5.535326e+00 7.838843 6.593663 4.90642090 5.801502
## TSLM training 1.467255e-15 8.181900 6.969203 -0.60118175 6.544394
## ETS test 6.290508e+00 8.486246 7.052546 5.55210633 6.209171
## ARIMA test 7.709842e+00 9.769035 8.203006 6.81318414 7.242044
## TSLM test 9.619135e+00 11.327962 9.689007 8.48741430 8.544645
## RW test -1.495833e+01 16.938159 14.958335 -13.70688327 13.706883
## MASE ACF1 Theil's U
## ARIMA training 0.5156512 0.000283213 NA
## ETS training 0.5362951 0.038634101 NA
## STL training 0.5576066 -0.231769996 NA
## RW training 1.1379732 0.257288876 NA
## STL test 1.2988675 0.651999435 1.359940
## TSLM training 1.3728441 0.891666568 NA
## ETS test 1.3892615 0.673704404 1.474194
## ARIMA test 1.6158874 0.711762211 1.708774
## TSLM test 1.9086107 0.698444987 1.970136
## RW test 2.9466009 0.734942375 3.062910
For the purpose of this question, ETS performed the best since it had a lower MAPE and MASE than the other models. However, there are other models, such as SLT, that might fit better with the data. The results show that SLT has a lower MAPE and MASE than ETS. ARIMA was close behind the ETS model.
train <- window(y,start=c(1990, 1),end=c(2019), time=8)
test <- window(y,start=c(2019), time=9)
both <- window(y,start=c(1990, 1))
h=length(test)
ETS <- ets(train, model="ZZZ")
fit2 <- forecast(ETS, h=13)
autoplot(y, series = "Actual") +
ggtitle("Total Sugar production") +
autolayer(fit2, PI = FALSE, series = "ETS Forecast") +
ylab("Monthly Sugar Production (Million Metric Tons)")+
theme(plot.title = element_text(size=15, face = "bold",hjust = 0.5) ,
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10))
The forecasted values from September 2019 to September 2020 are very similar to the actual data and are quite useful for predictions.
# ETS method
ets_function <- function(x, h) {
ETS <- ets(x, model="ZZZ")
forecast(ETS, h=h)
}
# STL Method
stl_function <- function(x, h) {
STL <- stl(x, t.window=15, s.window="periodic", robust=TRUE)
forecast(STL, method="rwdrift",h=h)
}
# trend plus seasonal
tslm_function <- function(x, h) {
tps <- tslm(x ~ trend + season)
forecast(tps,h=h)
}
## arima
arima_function <- function(x, h) {
forecast(auto.arima(x), h=h)
}
#Time Series Cross Validation I windowed the data to 60 observations which is technically till end of 1994
window_data <- window(y, end=c(1994,12))
# Calling functions and generating tsCV
e1 <- tsCV(window_data, ets_function, h=6)
e2 <- tsCV(window_data, stl_function, h=6)
e3 <- tsCV(window_data, tslm_function, h=6)
e4 <- tsCV(window_data, arima_function, h=6)
mse1 = mean(e1^2, na.rm=TRUE)
mse2 = mean(e2^2, na.rm=TRUE)
mse3 = mean(e3^2, na.rm=TRUE)
mse4 = mean(e4^2, na.rm=TRUE)
a.table<-rbind(mse1, mse2, mse3, mse4)
row.names(a.table)<-c('ETS MSE', 'STL MSE', 'TSLM MSE', 'ARIMA MSE')
a.table<-as.data.frame(a.table)
a.table
## V1
## ETS MSE 207.09058
## STL MSE 14.65243
## TSLM MSE 55.95463
## ARIMA MSE 319.42737
To produce the forecasts, we will use nested loops
models = c("ets_function","stl_function","tslm_function",
"arima_function")
# Nested loop -> R is not very fast in loops. + Forecasting
for (mdl in models) {
mse = c() # Initiate an empty data to store mse values for each model
for (i in c(1,2,3,4,5,6)){
e <- tsCV(window_data, get(mdl), h=i)
mse[i] = mean(e^2, na.rm=TRUE)
}
plot(mse,ann=FALSE)+
title(main=mdl, col.main="red", font.main=4)+
title(xlab="h")+
title(ylab="MSE")
}
Overall, the STL model performed the best and had the smallest MSE. In each of the forecasts, the MSE increases with each increase in period.