printing the basic features from each series, producing a plot:
# Our referenced series from M4 competition: Y500-Y509
count <- 500
for (series in c("Y500","Y501","Y502","Y503","Y504","Y505","Y506","Y507","Y508","Y509")){
assign(paste("ts", series, sep = "_"), ts(c(M4[[count]]$x, M4[[count]]$xx),start = start(M4[[count]]$x), frequency = frequency(M4[[count]]$x)))
cat("\nthe name of the series is:", paste("ts", series, sep = "_"))
cat("\nthe length of the series is:" ,length(get(paste("ts", series, sep = "_"))))
cat("\nthe frequancy of the series is:" ,frequency(get(paste("ts", series, sep = "_"))))
plot(get(paste("ts", series, sep = "_")),main=paste("Plotting Time Series", series, sep = " "), ylab = "Value", type = "l")
count <- count + 1
cat("\n")
}
##
## the name of the series is: ts_Y500
## the length of the series is: 55
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y501
## the length of the series is: 49
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y502
## the length of the series is: 35
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y503
## the length of the series is: 35
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y504
## the length of the series is: 55
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y505
## the length of the series is: 40
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y506
## the length of the series is: 40
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y507
## the length of the series is: 40
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y508
## the length of the series is: 35
## the frequancy of the series is: 1
##
##
## the name of the series is: ts_Y509
## the length of the series is: 50
## the frequancy of the series is: 1
Produce a forecast for each of the series (Y500-Y509), based of the 12 methods from M4 competition:
# Models function creation
fh <- 6 #The forecasting horizon examined
frq <- 1 #The frequency of the data
smape_cal <- function(outsample, forecasts){
#Used to estimate sMAPE
outsample <- as.numeric(outsample) ; forecasts<-as.numeric(forecasts)
smape <- (abs(outsample-forecasts)*200)/(abs(outsample)+abs(forecasts))
return(smape)
}
mase_cal <- function(insample, outsample, forecasts){
#Used to estimate MASE
frq <- frequency(insample)
forecastsNaiveSD <- rep(NA,frq)
for (j in (frq+1):length(insample))
{
forecastsNaiveSD <- c(forecastsNaiveSD, insample[j-frq])
}
masep<-mean(abs(insample-forecastsNaiveSD),na.rm = TRUE)
outsample <- as.numeric(outsample)
forecasts <- as.numeric(forecasts)
mase <- (abs(outsample-forecasts))/masep
return(mase)
}
naive_seasonal <- function(input, fh){
#Used to estimate Seasonal Naive
frcy <- frequency(input)
frcst <- naive(input, h=fh)$mean
if (frcy>1)
{
frcst <- head(rep(as.numeric(tail(input,frcy)), fh), fh) + frcst - frcst
}
return(frcst)
}
Theta.classic <- function(input, fh){
#Used to estimate Theta classic
#Set parameters
wses <- wlrl<-0.5 ; theta <- 2
#Estimate theta line (0)
observations <- length(input)
xt <- c(1:observations)
xf <- c((observations+1):(observations+fh))
train <- data.frame(input=input, xt=xt)
test <- data.frame(xt = xf)
estimate <- lm(input ~ poly(xt, 1, raw=TRUE))
thetaline0In <- as.numeric(predict(estimate))
thetaline0Out <- as.numeric(predict(estimate,test))
#Estimate theta line (2)
thetalineT <- theta*input+(1-theta)*thetaline0In
sesmodel <- ses(thetalineT, h=fh)
thetaline2In <- sesmodel$fitted
thetaline2Out <- sesmodel$mean
#Theta forecasts
forecastsIn <- (thetaline2In*wses)+(thetaline0In*wlrl)
forecastsOut <- (thetaline2Out*wses)+(thetaline0Out*wlrl)
#Zero forecasts become positive
for (i in 1:length(forecastsOut))
{
if (forecastsOut[i]<0)
{
forecastsOut[i]<-0
}
}
output=list(fitted = forecastsIn, mean = forecastsOut,
fitted0 = thetaline0In, mean0 = thetaline0Out,
fitted2 = thetaline2In, mean2 = thetaline2Out)
return(output)
}
SeasonalityTest <- function(input, ppy){
#Used to determine whether a time series is seasonal
tcrit <- 1.645
if (length(input)<3*ppy)
{
test_seasonal <- FALSE
}
else
{
xacf <- acf(input, plot = FALSE)$acf[-1, 1, 1]
clim <- tcrit/sqrt(length(input)) * sqrt(cumsum(c(1, 2 * xacf^2)))
test_seasonal <- ( abs(xacf[ppy]) > clim[ppy] )
if (is.na(test_seasonal)==TRUE)
{
test_seasonal <- FALSE
}
}
return(test_seasonal)
}
#Models Benchmarks Function
Benchmarks <- function(input, h){
#Used to estimate the statistical benchmarks of the M4 competition
#Estimate seasonaly adjusted time series
ppy <- frequency(input)
ST <- F
if (ppy>1)
{
ST <- SeasonalityTest(input,ppy)
}
if (ST==T)
{
Dec <- decompose(input,type="multiplicative")
des_input <- input/Dec$seasonal
SIout <- head(rep(Dec$seasonal[(length(Dec$seasonal)-ppy+1):length(Dec$seasonal)], fh), fh)
}
else
{
des_input <- input
SIout <- rep(1, fh)
}
naive <- naive(input, h=h)$mean #Naive
naive_s <- naive_seasonal(input, fh=h) #Seasonal Naive
naive2 <- naive(des_input, h=h)$mean*SIout #Naive2
ses <- ses(des_input, h=h)$mean*SIout #Ses
holt <- holt(des_input, h=h, damped=F)$mean*SIout #Holt
damp <- holt(des_input, h=h, damped=T)$mean*SIout #Damped
theta <- Theta.classic(input=des_input, fh=h)$mean*SIout #Theta
comb <- (ses+holt+damp)/3 #Comb
ets <- forecast(ets(des_input), h=fh, level =0)$mean*SIout #ets
arima <- forecast(auto.arima(des_input), h=fh, level =0)$mean*SIout #arima
mlp <- forecast(nnetar(des_input), h=fh, level =0)$mean*SIout #mlp
return(list(naive,naive_s,naive2,ses,holt,damp,theta,comb,ets,arima,mlp))
}
Names_benchmarks <- c("Naive", "sNaive", "Naive2", "SES", "Holt", "Damped", "Theta", "Comb", "ets","arima","mlp")
model_eval_func <- function(ts_count)
{
cat("\nthe name of the series is:", paste("ts", ts_count, sep = "_"))
Names_benchmarks <- c("Naive", "sNaive", "Naive2", "SES", "Holt", "Damped", "Theta", "Comb", "ets","arima","mlp")
data_train <- NULL
data_test <- NULL
data_all <- assign(paste("ts", ts_count, sep = "_"), ts(c(M4[[ts_count]]$x, M4[[ts_count]]$xx),start = start(M4[[ts_count]]$x), frequency = frequency(M4[[ts_count]]$x)))
data_train[length(data_train)+1] <- list(ts(head(data_all,length(data_all)-fh),frequency = frq))
data_test[length(data_test)+1] <- list(tail(data_all,fh))
SMAPE_df <- data.frame(Names_benchmarks)
MASE_df <- data.frame(Names_benchmarks)
OWA_df <- data.frame(Names_benchmarks)
Total_smape <- array(NA,dim = c(length(Names_benchmarks), fh, length(data_train)))
Total_mase <- array(NA,dim = c(length(Names_benchmarks), fh, length(data_train)))
#Methods, Horizon, time-series
for (i in 1:length(data_train)){
insample <- data_train[[i]]
outsample <- data_test[[i]]
forecasts <- Benchmarks(input=insample, h=fh)
#SMAPE
for (j in 1:length(Names_benchmarks)){
Total_smape[j,,i] <- smape_cal(outsample, forecasts[[j]])
}
#MASE
for (j in 1:length(Names_benchmarks)){
Total_mase[j,,i] <- mase_cal(insample, outsample, forecasts[[j]])
}
}
print("-------------------- sMAPE")
for (i in 1:length(Names_benchmarks)){
SMAPE <- round(mean(Total_smape[i,,]), 3)
SMAPE_df[i,2] <- SMAPE
print(paste(Names_benchmarks[i], SMAPE))
}
print("\n-------------------- MASE")
for (i in 1:length(Names_benchmarks)){
MASE <- round(mean(Total_mase[i,,]), 3)
MASE_df[i,2] <- MASE
print(paste(Names_benchmarks[i], MASE))
}
print("-------------------- OWA")
for (i in 1:length(Names_benchmarks)){
OWA <- round(((mean(Total_mase[i,,])/mean(Total_mase[3,,]))+(mean(Total_smape[i,,])/mean(Total_smape[3,,])))/2, 3)
OWA_df[i,2] <- OWA
print(paste(Names_benchmarks[i],OWA))
}
grouped_df <- data.frame(MASE_df$V2,SMAPE_df$V2,OWA_df$V2)
barplot(t(as.matrix(grouped_df)), names.arg = Names_benchmarks , cex.names = 0.6 ,beside=TRUE, main=paste("Visual Compresion Y", ts_count, " Series", sep = " "), legend = c("MASE", "SMAPE", "OWA"))
return()
}
for (series in 500:509){
x <- model_eval_func(series)
}
##
## the name of the series is: ts_500[1] "-------------------- sMAPE"
## [1] "Naive 0.84"
## [1] "sNaive 0.84"
## [1] "Naive2 0.84"
## [1] "SES 0.84"
## [1] "Holt 0.805"
## [1] "Damped 0.722"
## [1] "Theta 1.079"
## [1] "Comb 0.242"
## [1] "ets 0.805"
## [1] "arima 2.782"
## [1] "mlp 0.484"
## [1] "\n-------------------- MASE"
## [1] "Naive 0.798"
## [1] "sNaive 0.798"
## [1] "Naive2 0.798"
## [1] "SES 0.798"
## [1] "Holt 0.773"
## [1] "Damped 0.693"
## [1] "Theta 1.037"
## [1] "Comb 0.231"
## [1] "ets 0.773"
## [1] "arima 2.702"
## [1] "mlp 0.463"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.964"
## [1] "Damped 0.864"
## [1] "Theta 1.292"
## [1] "Comb 0.289"
## [1] "ets 0.964"
## [1] "arima 3.35"
## [1] "mlp 0.578"
##
## the name of the series is: ts_501[1] "-------------------- sMAPE"
## [1] "Naive 2.376"
## [1] "sNaive 2.376"
## [1] "Naive2 2.376"
## [1] "SES 2.418"
## [1] "Holt 3.308"
## [1] "Damped 2.08"
## [1] "Theta 2.12"
## [1] "Comb 2.115"
## [1] "ets 3.311"
## [1] "arima 3.301"
## [1] "mlp 2.107"
## [1] "\n-------------------- MASE"
## [1] "Naive 0.881"
## [1] "sNaive 0.881"
## [1] "Naive2 0.881"
## [1] "SES 0.897"
## [1] "Holt 1.245"
## [1] "Damped 0.771"
## [1] "Theta 0.786"
## [1] "Comb 0.785"
## [1] "ets 1.246"
## [1] "arima 1.242"
## [1] "mlp 0.782"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1.018"
## [1] "Holt 1.403"
## [1] "Damped 0.875"
## [1] "Theta 0.892"
## [1] "Comb 0.89"
## [1] "ets 1.404"
## [1] "arima 1.4"
## [1] "mlp 0.887"
##
## the name of the series is: ts_502[1] "-------------------- sMAPE"
## [1] "Naive 15.965"
## [1] "sNaive 15.965"
## [1] "Naive2 15.965"
## [1] "SES 15.965"
## [1] "Holt 6.563"
## [1] "Damped 9.478"
## [1] "Theta 11.21"
## [1] "Comb 10.58"
## [1] "ets 6.563"
## [1] "arima 6.607"
## [1] "mlp 13.502"
## [1] "\n-------------------- MASE"
## [1] "Naive 4.855"
## [1] "sNaive 4.855"
## [1] "Naive2 4.855"
## [1] "SES 4.855"
## [1] "Holt 2.078"
## [1] "Damped 2.972"
## [1] "Theta 3.488"
## [1] "Comb 3.302"
## [1] "ets 2.078"
## [1] "arima 2.092"
## [1] "mlp 4.157"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.42"
## [1] "Damped 0.603"
## [1] "Theta 0.71"
## [1] "Comb 0.671"
## [1] "ets 0.42"
## [1] "arima 0.422"
## [1] "mlp 0.851"
##
## the name of the series is: ts_503[1] "-------------------- sMAPE"
## [1] "Naive 15.216"
## [1] "sNaive 15.216"
## [1] "Naive2 15.216"
## [1] "SES 15.216"
## [1] "Holt 7.444"
## [1] "Damped 9.728"
## [1] "Theta 11.331"
## [1] "Comb 10.734"
## [1] "ets 7.445"
## [1] "arima 7.314"
## [1] "mlp 11.722"
## [1] "\n-------------------- MASE"
## [1] "Naive 4.812"
## [1] "sNaive 4.812"
## [1] "Naive2 4.812"
## [1] "SES 4.812"
## [1] "Holt 2.438"
## [1] "Damped 3.159"
## [1] "Theta 3.652"
## [1] "Comb 3.47"
## [1] "ets 2.439"
## [1] "arima 2.397"
## [1] "mlp 3.773"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.498"
## [1] "Damped 0.648"
## [1] "Theta 0.752"
## [1] "Comb 0.713"
## [1] "ets 0.498"
## [1] "arima 0.489"
## [1] "mlp 0.777"
##
## the name of the series is: ts_504[1] "-------------------- sMAPE"
## [1] "Naive 13.912"
## [1] "sNaive 13.912"
## [1] "Naive2 13.912"
## [1] "SES 13.913"
## [1] "Holt 21.76"
## [1] "Damped 14.396"
## [1] "Theta 12.039"
## [1] "Comb 9.834"
## [1] "ets 19.372"
## [1] "arima 6.758"
## [1] "mlp 35.424"
## [1] "\n-------------------- MASE"
## [1] "Naive 4.532"
## [1] "sNaive 4.532"
## [1] "Naive2 4.532"
## [1] "SES 4.532"
## [1] "Holt 8.389"
## [1] "Damped 5.17"
## [1] "Theta 3.951"
## [1] "Comb 3.394"
## [1] "ets 7.33"
## [1] "arima 2.187"
## [1] "mlp 15.311"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 1.708"
## [1] "Damped 1.088"
## [1] "Theta 0.869"
## [1] "Comb 0.728"
## [1] "ets 1.505"
## [1] "arima 0.484"
## [1] "mlp 2.963"
##
## the name of the series is: ts_505[1] "-------------------- sMAPE"
## [1] "Naive 23.832"
## [1] "sNaive 23.832"
## [1] "Naive2 23.832"
## [1] "SES 23.831"
## [1] "Holt 15.007"
## [1] "Damped 7.051"
## [1] "Theta 19.461"
## [1] "Comb 9.855"
## [1] "ets 10.142"
## [1] "arima 14.7"
## [1] "mlp 19.163"
## [1] "\n-------------------- MASE"
## [1] "Naive 7.295"
## [1] "sNaive 7.295"
## [1] "Naive2 7.295"
## [1] "SES 7.295"
## [1] "Holt 4.786"
## [1] "Damped 2.551"
## [1] "Theta 6.083"
## [1] "Comb 3.225"
## [1] "ets 3.294"
## [1] "arima 4.694"
## [1] "mlp 6.005"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.643"
## [1] "Damped 0.323"
## [1] "Theta 0.825"
## [1] "Comb 0.428"
## [1] "ets 0.439"
## [1] "arima 0.63"
## [1] "mlp 0.814"
##
## the name of the series is: ts_506[1] "-------------------- sMAPE"
## [1] "Naive 24.053"
## [1] "sNaive 24.053"
## [1] "Naive2 24.053"
## [1] "SES 24.052"
## [1] "Holt 14.765"
## [1] "Damped 12.461"
## [1] "Theta 19.623"
## [1] "Comb 16.956"
## [1] "ets 13.729"
## [1] "arima 14.928"
## [1] "mlp 18.284"
## [1] "\n-------------------- MASE"
## [1] "Naive 7.177"
## [1] "sNaive 7.177"
## [1] "Naive2 7.177"
## [1] "SES 7.177"
## [1] "Holt 4.592"
## [1] "Damped 3.91"
## [1] "Theta 5.977"
## [1] "Comb 5.227"
## [1] "ets 4.287"
## [1] "arima 4.64"
## [1] "mlp 5.614"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.627"
## [1] "Damped 0.531"
## [1] "Theta 0.824"
## [1] "Comb 0.717"
## [1] "ets 0.584"
## [1] "arima 0.634"
## [1] "mlp 0.771"
##
## the name of the series is: ts_507[1] "-------------------- sMAPE"
## [1] "Naive 23.096"
## [1] "sNaive 23.096"
## [1] "Naive2 23.096"
## [1] "SES 23.095"
## [1] "Holt 14.18"
## [1] "Damped 12.519"
## [1] "Theta 18.861"
## [1] "Comb 16.48"
## [1] "ets 23.095"
## [1] "arima 14.231"
## [1] "mlp 18.574"
## [1] "\n-------------------- MASE"
## [1] "Naive 6.772"
## [1] "sNaive 6.772"
## [1] "Naive2 6.772"
## [1] "SES 6.772"
## [1] "Holt 4.333"
## [1] "Damped 3.853"
## [1] "Theta 5.644"
## [1] "Comb 4.986"
## [1] "ets 6.772"
## [1] "arima 4.348"
## [1] "mlp 5.571"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.627"
## [1] "Damped 0.556"
## [1] "Theta 0.825"
## [1] "Comb 0.725"
## [1] "ets 1"
## [1] "arima 0.629"
## [1] "mlp 0.813"
##
## the name of the series is: ts_508[1] "-------------------- sMAPE"
## [1] "Naive 77.788"
## [1] "sNaive 77.788"
## [1] "Naive2 77.788"
## [1] "SES 77.786"
## [1] "Holt 72.665"
## [1] "Damped 74.1"
## [1] "Theta 77.037"
## [1] "Comb 74.831"
## [1] "ets 77.786"
## [1] "arima 117.61"
## [1] "mlp 112.979"
## [1] "\n-------------------- MASE"
## [1] "Naive 14.459"
## [1] "sNaive 14.459"
## [1] "Naive2 14.459"
## [1] "SES 14.459"
## [1] "Holt 13.766"
## [1] "Damped 13.966"
## [1] "Theta 14.359"
## [1] "Comb 14.063"
## [1] "ets 14.459"
## [1] "arima 19.017"
## [1] "mlp 18.539"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.943"
## [1] "Damped 0.959"
## [1] "Theta 0.992"
## [1] "Comb 0.967"
## [1] "ets 1"
## [1] "arima 1.414"
## [1] "mlp 1.367"
##
## the name of the series is: ts_509[1] "-------------------- sMAPE"
## [1] "Naive 9.066"
## [1] "sNaive 9.066"
## [1] "Naive2 9.066"
## [1] "SES 9.066"
## [1] "Holt 1.984"
## [1] "Damped 1.435"
## [1] "Theta 6.971"
## [1] "Comb 1.731"
## [1] "ets 4.824"
## [1] "arima 2.209"
## [1] "mlp 1.018"
## [1] "\n-------------------- MASE"
## [1] "Naive 4.129"
## [1] "sNaive 4.129"
## [1] "Naive2 4.129"
## [1] "SES 4.129"
## [1] "Holt 0.97"
## [1] "Damped 0.698"
## [1] "Theta 3.212"
## [1] "Comb 0.82"
## [1] "ets 2.246"
## [1] "arima 1.08"
## [1] "mlp 0.489"
## [1] "-------------------- OWA"
## [1] "Naive 1"
## [1] "sNaive 1"
## [1] "Naive2 1"
## [1] "SES 1"
## [1] "Holt 0.227"
## [1] "Damped 0.164"
## [1] "Theta 0.773"
## [1] "Comb 0.195"
## [1] "ets 0.538"
## [1] "arima 0.253"
## [1] "mlp 0.115"
A brief summary of the article- The article we chose, ‘Fast and accurate yearly time series forecasting with forecast combinations by David Shaub’ discussing different aspects of the authors’ submission for the m4 competition, which ranked third on yearly time series, outperforming many other competitors. His submission was based on an simplistic ensembled model, which combined 3 of the models provided by the ‘‘forecast’’ package (Hyndman & Khandakar, 2008): auto.arima, thetaf, and tbats (with equal weights for each model). The discussion in the article is divided into few sections, and mainly discusses the forecast methodology and its advantages, also providing an analysis of the forecast performance (compared to other models as benchmarks). Shaub explains that the relatively high accuracy of the ensembled model (compared to each model individually, and to more complex metrics in general) comes mainly from reducing the specification error that gets from each model individually. He also emphasises the simplicity of his forecasting approach, which can be implemented easily by any business/ user which is interested in forecasting a large number of time series automatically, without intervention, and without a global model that needs retraining.
Aspect to be explored in the article- We decided to explore the specific forecasting method described in the article, based on it’s attached code file; we will forecasted our 10 chosen series (Y500-Y509, yearly series) using the same methodology Shaub used, which based on ‘forecastHybrid’’ package (Shaub & Ellis, 2018) and ‘hybridModel’ function from this package. We will compare our models’ performances to Holt-Winter exponential smoothing with ‘ZZZ’ argument, for maximum flexibility to be able to adjust all different series and its patterns.
args = commandArgs(trailingOnly = TRUE)
inputs <- c("Yearly")
paths <- paste0("~", inputs, "-train.csv")
combineForecasts <- function(forecastPI, forecastsPoint){
results <- forecastPI
for(ind in seq_along(results)){
extracted <- lapply(forecastsPoint, function(x) x[[ind]]$mean)
extracted <- extracted[!sapply(extracted, is.null)]
if(length(extracted) > 0){
combined <- ts(rowMeans(data.frame(extracted), na.rm = TRUE))
tsp(combined) <- tsp(forecastPI[[ind]]$mean)
results[[ind]]$mean <- combined
}
}
return(results)
}
# Extract the dataframe to a list of msts objects
extractList <- function(x, seriesName){
tab <- c( "Yearly" = 1)
seriesFrequency <- tab[seriesName]
tab <- list("Yearly" = c(1))
multSeason <- tab[[seriesName]]
cleaned <- as.numeric(x[!is.na(as.numeric(x))])
series <- msts(cleaned, seasonal.periods = multSeason, ts.frequency = seriesFrequency)
return(series)
}
thiefForecast <- function(x, horizon, model){
if(length(x) < 2 * frequency(x)){
return(NULL)
}
return(thief(y = x, h = horizon, usemodel = model))
}
writeResults <- function(forecastList, seriesName, floorZero = TRUE){
components <- c("mean", "lower", "upper")
for(component in components){
baseDir <- paste0("~/", component)
dir.create(file.path(baseDir), showWarnings = FALSE)
forecasts <- data.frame(t(data.frame(lapply(forecastList, FUN = function(x) x[[component]]))))
forecasts <- round(forecasts, 4)
if(floorZero){
forecasts[forecasts < 0] <- 0
}
rownames(forecasts) <- names(forecastList)
filename <- paste0(baseDir, "/", seriesName, "-", component, ".csv")
write.table(x = forecasts, file = filename, quote = FALSE, sep = ",", col.names = FALSE)
}
}
inputPath <- "Yearly-train.csv"
dat <- fread(inputPath, header = TRUE, data.table = FALSE)
range <- c("Y500","Y501","Y502","Y503","Y504","Y505","Y506","Y507","Y508","Y509")
for (row in range) {
# Train Set
inputPath <- "Yearly-train.csv"
train_set <- fread(inputPath, header = TRUE, data.table = FALSE)
train_set <- train_set[train_set$V1 %in% row,]
seriesNames <- train_set[, 1]
train_set <- train_set[, -1]
# Validation Set
inputPath <- "Yearly-test.csv"
validation_set <- fread(inputPath, header = TRUE, data.table = FALSE)
validation_set <- validation_set[validation_set$V1 %in% row,]
seriesNames <- validation_set[, 1]
validation_set <- validation_set[, -1]
validation_set <- as_tibble(validation_set)
# Extract series names and remove from dataframe
currentSeries <- "Yearly"
h <- 6
# Transform to list
train_set <- apply(train_set, MARGIN = 1, FUN = function(x) extractList(x, currentSeries))
names(train_set) <- seriesNames
models <- "aft"
model <- hybridModel(train_set, models = models, verbose = FALSE)
forecasts <-forecast(model, h = h, level = 95,PI.combination = "mean")
#Model Errors
accurecy_model <- accuracy(forecasts, t(validation_set))
RMSE <- round(accurecy_model[2,2],2)
sMAPE <- round(mean(smape_cal(validation_set, forecasts$mean)),2)
MASE <- round(mean(mase_cal(train_set, validation_set,forecasts$mean)),2)
# Plot
plot(train_set, main = c(paste0("Series ",row)), sub =c(paste0("RMSE ",RMSE," MASE ",MASE, " SMAPE ",sMAPE,'%')), xlab="Year", ylab="Number", ylim = range(min(train_set),max(forecasts$mean)),xlim = range(0,length(forecasts$mean)+length(train_set)))
#lines(model_pred$mean, lwd = 2, lty=2, col = "green")
lines(forecasts$mean, lwd = 2, lty=2, col = "red")
lines(forecasts$fitted, lwd = 2, lty=2, col = "green")
legend("bottomright", c("train real", "Test Prediction", "Train Prediction"), lty=c(1,2,2), lwd=c(1,2,2),col = c("black","red","green"), bty = "n",cex = 0.8)
}
The results of the exploration- pros-
can protect against model selection risks and prevent some large
forecasting errors. easy to implement and to automate Flexibility on
models selection and weighting according to the data and business case.
Users with access to a large number of CPU cores or threads will
particularly benefit from this methodology since both training and
forecasting scale linearly with the number of processes, thus reducing
the time required to forecast a collection of time series.
Cons- The methodology is mostly effective on yearly time series,
therefore is not applicable for a lot of use cases and datasets.
Using user-supplied weights in ‘hybridModel’’ may cause overfitting, for
that reason cross validation is recommended.
Conclusion: We choose to analyze an yearly series (Y500 - Y509) from M4 competition by veriues of modeled in order to choose the best model that will capture the yearly trend and will have the lowest errors rate. The models we decided to test the data as are:
[1] “Naive” [2] “sNaive” [3] “Naive2” [4] “SES” [5] “Holt” [6] “Damped” [7] “Theta” [8] “Comb” [9] “ets” [10] “arima”
and the metrics we used in order to distinguish if the models is fit to the data or not are - OWA, SMAPE and RMSE. In question number 3, we used the model hybrid Model that allow us to anssable many types of models in one line of code. The models we decided to combine using the hybrid function are auto.arime, thtam and tbats.The key errors we measure the model by are sMAPE, MASE and RMSE. In conclusion, the results of the hybrid model in question 3 didnt preformed better then the 1 of the 12 models in question 4, the gaps between the MASE and sMAPE error are minors, but still the models out preformed in comparision to the hybrid model.