library(fpp2)
library(knitr)
# Read the data and convert it to a monthly time series starting January 1990
SUGAR = read.csv("C:/ECON 4210/as2_data.csv")
df = ts(SUGAR, start=1990, frequency=12)
y = df[,"SUGAR"]
y %>%
stl(t.window=13, s.window="periodic", robust=TRUE) %>%
autoplot()
Prior to evaluating the different forecasting models, it is necessary to examine the properties of the data. An STL decomposition (Seasonal and Trend decomposition using Loess) gives a clearer view of the trend, seasonality, and remainder. The data appears to be non-stationary, as the values tend to increase over the years, except during the 2008-2009 financial crisis. The data also shows strong seasonality. The remainder component stays relatively stable, with a few outliers, again around the 2008-2009 financial crisis.
We will use data up to the end of 2014 for training and data from January 2015 to August 2019 for testing.
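As a quick visual check on that seasonality, a season plot makes the within-year pattern explicit. This is an optional sketch; ggseasonplot() comes with the forecast package that fpp2 loads.
# Sketch: season plot of the full series to confirm the monthly pattern
ggseasonplot(y, year.labels=FALSE) +
  ylab("SUGAR") +
  ggtitle("Seasonal plot: monthly sugar production")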
# Training sample: January 1990 to December 2014; test sample: January 2015 onward
train <- window(y, start=c(1990, 1), end=c(2014, 12))
test <- window(y, start=2015)
both <- window(y, start=c(1990, 1))
h = length(test)
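A quick plot of the split confirms the two windows line up as intended (an optional sketch, not part of the original analysis):
# Sketch: visualize the training/test split
autoplot(train, series="Training") +
  autolayer(test, series="Test") +
  ylab("SUGAR")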
# Linear regression with trend and seasonal dummies
tps = tslm(train ~ trend + season)
fit1 = forecast(tps, h=h)
p1 = autoplot(fit1) + ylab("SUGAR")
# STL decomposition, forecasting the seasonally adjusted series by random walk with drift
STL <- stl(train, t.window=15, s.window="periodic", robust=TRUE)
fit2 <- forecast(STL, method="rwdrift", h=h)
p2 = autoplot(fit2) + ylab("SUGAR")
# ETS with automatic selection of error, trend, and seasonal components
ETS <- ets(train, model="ZZZ")
fit3 <- forecast(ETS, h=h)
p3 = autoplot(fit3) + ylab("SUGAR")
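Before forecasting, the ETS fit can be diagnosed with checkresiduals(), which plots the residuals and their ACF and runs a Ljung-Box test. This step is an optional addition, not part of the original code.
# Sketch: residual diagnostics for the ETS fit (time plot, ACF,
# histogram, and a Ljung-Box test for remaining autocorrelation)
checkresiduals(ETS)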
ndiffs(y)
## [1] 0
According to ndiffs(), zero first differences are required for the series. auto.arima() performs this check internally, so it will apply the appropriate order of differencing automatically.
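For a monthly series it is also worth checking whether seasonal differencing is required; nsdiffs(), also from the forecast package, does this. It was not run in the original, so this is an optional check.
# Sketch: number of seasonal differences suggested for the series
nsdiffs(y)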
# Auto Arima
fit4 = forecast(auto.arima(train), h=h)
p4 = autoplot(fit4) + ylab("SUGAR")
# Random walk with drift
fit5 = rwf(train, h=h, drift = TRUE)
p5 = autoplot(fit5) + ylab("SUGAR")
gridExtra::grid.arrange(p1, p2, p3, nrow=3)
gridExtra::grid.arrange(p4, p5, nrow=2)
It appears that all of the models except the random walk with drift have captured the seasonality and trend in the data. The random walk with drift captures the trend but ignores the seasonal component. The other four models capture the seasonality well, but they do not capture the trend as well. It is surprising that auto.arima failed to capture the trend, since it searches over a range of candidate ARIMA specifications and selects the one with the best information criterion.
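Two optional benchmarks speak to this point; both are sketches using standard forecast-package options rather than part of the original model set. A wider auto.arima() search may find a better specification, and a seasonal naive forecast captures the monthly pattern that rwf() misses.
# Sketch: a wider ARIMA search; stepwise=FALSE and approximation=FALSE
# are slower but evaluate many more candidate models
fit4b = forecast(auto.arima(train, stepwise=FALSE, approximation=FALSE), h=h)
# Sketch: seasonal naive benchmark, which repeats the last observed
# seasonal cycle
fit6 = snaive(train, h=h)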
# Accuracy measures on the training and test sets for each model
a1 = accuracy(fit1, test)
a2 = accuracy(fit2, test)
a3 = accuracy(fit3, test)
a4 = accuracy(fit4, test)
a5 = accuracy(fit5, test)
a.table<-rbind(a1, a2, a3, a4, a5)
row.names(a.table) = c('TSLM training','TSLM test','STL training','STL test','ETS training','ETS test','ARIMA training','ARIMA test','RW training','RW test')
a.table<-as.data.frame(a.table)
a.table<-a.table[order(a.table$MASE),]
a.table
| Model          | ME            | RMSE      | MAE       | MPE          | MAPE      | MASE      | ACF1         | Theil's U |
|----------------|---------------|-----------|-----------|--------------|-----------|-----------|--------------|-----------|
| ARIMA training | 4.455841e-02  | 3.521420  | 2.617688  | -0.03631179  | 2.413840  | 0.5156512 | 0.000283213  | NA        |
| ETS training   | 1.063591e-02  | 3.611275  | 2.722486  | -0.05983491  | 2.510413  | 0.5362951 | 0.038634101  | NA        |
| STL training   | -1.520768e-15 | 3.747449  | 2.830674  | -0.04703100  | 2.610017  | 0.5576066 | -0.231769996 | NA        |
| RW training    | 3.803327e-16  | 7.776956  | 5.776888  | -0.24697590  | 5.278488  | 1.1379732 | 0.257288876  | NA        |
| STL test       | 5.535326e+00  | 7.838843  | 6.593663  | 4.90642090   | 5.801502  | 1.2988675 | 0.651999435  | 1.359940  |
| TSLM training  | 1.467255e-15  | 8.181900  | 6.969203  | -0.60118175  | 6.544394  | 1.3728441 | 0.891666568  | NA        |
| ETS test       | 6.290508e+00  | 8.486246  | 7.052546  | 5.55210633   | 6.209171  | 1.3892615 | 0.673704404  | 1.474194  |
| ARIMA test     | 7.709842e+00  | 9.769035  | 8.203006  | 6.81318414   | 7.242044  | 1.6158874 | 0.711762211  | 1.708774  |
| TSLM test      | 9.619135e+00  | 11.327962 | 9.689007  | 8.48741430   | 8.544645  | 1.9086107 | 0.698444987  | 1.970136  |
| RW test        | -1.495833e+01 | 16.938159 | 14.958335 | -13.70688327 | 13.706883 | 2.9466009 | 0.734942375  | 3.062910  |
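Since knitr is already loaded, kable() can render this table directly in the knitted document (an optional formatting step):
# Sketch: nicer rendering of the accuracy table
knitr::kable(round(a.table, 4))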
For the purpose of this question, ETS performed well on the test set, with a lower MAPE and MASE than most of the other models. However, the STL model fits the data even better: the results show that STL has a lower test MAPE and MASE than ETS. ARIMA was close behind the ETS model.
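Whether STL's edge over ETS is statistically meaningful could be checked with a Diebold-Mariano test; dm.test() is in the forecast package. This is an optional sketch, and its default h=1 is a simplification for multi-step forecasts.
# Sketch: Diebold-Mariano test comparing STL and ETS test-set errors
e_stl = test - fit2$mean
e_ets = test - fit3$mean
dm.test(e_stl, e_ets)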
train <- window(y, start=c(1990, 1), end=c(2019, 8))
test <- window(y, start=c(2019, 9))
both <- window(y,start=c(1990, 1))
h=length(test)
ETS <- ets(train, model="ZZZ")
fit2 <- forecast(ETS, h=13)  # 13 months ahead: September 2019 through September 2020
autoplot(y, series = "Actual") +
  ggtitle("Total Sugar Production") +
  autolayer(fit2, PI = FALSE, series = "ETS Forecast") +
  ylab("Monthly Sugar Production (Million Metric Tons)") +
  theme(plot.title = element_text(size=15, face="bold", hjust=0.5),
        axis.text.x = element_text(size=10),
        axis.text.y = element_text(size=10))
The forecasted values from September 2019 to September 2020 are very similar to the actual data. This forecast can be quite useful for predicting monthly sugar production.
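If the series actually contains observations after August 2019, the visual similarity can be quantified; this sketch assumes y extends past the training window.
# Sketch: out-of-sample accuracy, assuming actuals exist after August 2019
accuracy(fit2, window(y, start=c(2019, 9)))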
# ETS method
ets_function <- function(x, h) {
ETS <- ets(x, model="ZZZ")
forecast(ETS, h=h)
}
# STL Method
stl_function <- function(x, h) {
STL <- stl(x, t.window=15, s.window="periodic", robust=TRUE)
forecast(STL, method="rwdrift",h=h)
}
# trend plus seasonal
tslm_function <- function(x, h) {
tps <- tslm(x ~ trend + season)
forecast(tps,h=h)
}
# ARIMA
arima_function <- function(x, h) {
forecast(auto.arima(x), h=h)
}
# Random Walk: rwf() already has the (x, h) signature that tsCV()
# expects, so no wrapper is needed; it is passed to tsCV() directly below.
## Time Series Cross Validation
Using window(), I kept the first 60 observations in the data, covering January 1990 through December 1994.
window_data <- window(y, end=c(1994,12))
# Calling functions and generating tsCV
e1 <- tsCV(window_data, ets_function, h=6)
e2 <- tsCV(window_data, stl_function, h=6)
e3 <- tsCV(window_data, tslm_function, h=6)
e4 <- tsCV(window_data, arima_function, h=6)
e5 <- tsCV(window_data, rwf, drift=TRUE, h=6)
mse1 = mean(e1^2, na.rm=TRUE)
mse2 = mean(e2^2, na.rm=TRUE)
mse3 = mean(e3^2, na.rm=TRUE)
mse4 = mean(e4^2, na.rm=TRUE)
mse5 = mean(e5^2, na.rm=TRUE)
a.table<-rbind(mse1, mse2, mse3, mse4, mse5)
row.names(a.table)<-c('ETS MSE', 'STL MSE', 'TSLM MSE', 'ARIMA MSE', 'RWF MSE')
a.table<-as.data.frame(a.table)
a.table
| Model | MSE       |
|-------|-----------|
| ETS   | 207.09058 |
| STL   | 14.65243  |
| TSLM  | 55.95463  |
| ARIMA | 319.42737 |
| RWF   | 476.07160 |
To compare the cross-validated MSE across forecast horizons h = 1 to 6, we loop over the models.
models = c("ets_function","stl_function","tslm_function","arima_function")
for (mdl in models) {
mse = c()
for (i in c(1,2,3,4,5,6)){
e <- tsCV(window_data, get(mdl), h=i)
mse[i] = mean(e^2, na.rm=TRUE)
}
plot(mse,ann=FALSE)+
title(main=mdl, col.main="red", font.main=4)+
title(xlab="h")+
title(ylab="MSE")
}
models = c("rwf")
for (mdl in models) {
mse = c()
for (i in c(1,2,3,4,5,6)){
f <- tsCV(window_data, get(mdl), h=i)
mse[i] = mean(f^2, na.rm=TRUE)
}
plot(mse,ann=FALSE)+
title(main=mdl, col.main="red", font.main=4)+
title(xlab="h")+
title(ylab="MSE")
}
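As an aside, tsCV() with h=6 already returns a matrix with one column of errors per horizon, so the per-horizon MSEs can be obtained from a single cross-validation run instead of re-running tsCV() for each h. The sketch below shows this for the STL wrapper; unlike the loop above, the column means isolate each horizon rather than pooling horizons 1 through i.
# Sketch: one tsCV() call; colMeans over squared errors gives the MSE
# separately for each horizon h = 1..6
e <- tsCV(window_data, stl_function, h=6)
mse_by_h <- colMeans(e^2, na.rm=TRUE)
plot(mse_by_h, type="b", xlab="h", ylab="MSE", main="stl_function")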
In each of the models, MSE increases as the forecast horizon increases; there is a strong, positive relationship between MSE and h (the forecast horizon). Surprisingly, the ETS model received the third-largest MSE, even though it performed very well in the first part of the question. Auto ARIMA and the random walk performed the worst, with the highest MSEs, which is similar to the results in part A. Surprisingly, the linear model (TSLM) performed well, achieving the second-lowest MSE behind the STL model, which is contrary to the results found in part A of the question.
Overall, the STL model performed the best, with the smallest MSE at approximately 14. This is consistent with the results found in part A, where it achieved the smallest test MAPE and MASE. This makes sense because STL fits localized loess regressions to smooth the data, so it can adapt to local changes in the trend and seasonal pattern and capture the small jumps, spikes, and wiggles in the series.