Summary

Time series are analysed to understand the past and to predict the future, enabling managers or policy makers to make properly informed decisions. Here a subset of typical time series forecasting models are compared. This data is for monthly anti-diabetic drug sales in Australia from 1992 to 2008. Total monthly scripts for pharmaceutical products falling under ATC code A10, as recorded by the Australian Health Insurance Commission.

Questions

  1. What is a good model to predict the drug sales?

  2. How do the models compare with different error metrics?


library(forecast)
library(fpp)
library(ggplot2)
library(gridExtra)


First I will plot the time series of interest to see what it looks like. We can see that the trend is up and that there is a clear seasonal pattern in the series.

ts_all <- a10

ggplot(ts_all, aes(as.Date(ts_all), as.matrix(ts_all))) + 
  geom_line(colour = "red")+
  ylab("Sales") + 
  xlab("Date") + 
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))


Next I will decompose the time series into seasonal, trend and irregular components using moving averages. Here the trend and seasonal patterns are clearer.

temp <- decompose(ts_all)
tst <- data.frame(actual = as.matrix(temp$x), date = as.Date(temp$x), seasonal = as.matrix(temp$seasonal),
                   trend = as.matrix(temp$trend), random = as.matrix(temp$random), type = temp$type)

a <- ggplot(tst, aes(tst[,2], tst[,1]))+
  geom_line()+
  ylab("observed") + 
  xlab("") + 
  ggtitle(paste("Decomposition of",  tst$type, "time series", sep=" "))+
  scale_colour_gradient2(low = "blue", mid = "red",high = "black", midpoint = mean(tst[,1]), name="Tooth Length")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), 
        axis.title = element_text(colour = 'black'), legend.text=element_text(), 
        legend.title=element_text(), legend.key = element_rect(colour = "black"), 
        plot.margin=unit(c(0,0.5,-0.4,0.5), "cm"))

b <- ggplot(tst, aes(tst[,2], tst[,3]))+
  geom_line()+
  ylab("seasonal") + 
  xlab("") + 
  ggtitle("")+
  scale_colour_gradient2(low = "blue", mid = "red",high = "black", midpoint = mean(tst[,1]), name="Tooth Length")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), 
        axis.title = element_text(colour = 'black'), legend.text=element_text(), 
        legend.title=element_text(), legend.key = element_rect(colour = "black"),
        plot.margin=unit(c(0,0.5,-0.4,0.5), "cm"))

c <- ggplot(tst, aes(tst[,2], tst[,4]))+
  geom_line()+
  ylab("trend") + 
  xlab("") + 
  ggtitle("")+
  scale_colour_gradient2(low = "blue", mid = "red",high = "black", midpoint = mean(tst[,1]), name="Tooth Length")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), 
        axis.title = element_text(colour = 'black'), legend.text=element_text(), 
        legend.title=element_text(), legend.key = element_rect(colour = "black"),
        plot.margin=unit(c(0,0.5,-0.4,0.5), "cm"))

d <- ggplot(tst, aes(tst[,2], tst[,5]))+
  geom_line()+
  ylab("random") + 
  xlab("") + 
  ggtitle("")+
  scale_colour_gradient2(low = "blue", mid = "red",high = "black", midpoint = mean(tst[,1]), name="Tooth Length")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), 
        axis.title = element_text(colour = 'black'), legend.text=element_text(), 
        legend.title=element_text(), legend.key = element_rect(colour = "black"),
        plot.margin=unit(c(0,0.5,-0.4,0.5), "cm"))

grid.arrange(a,b,c,d, ncol = 1)


To further look at the effect seasonality has on the series, I plot the sales values per month. We can see that Januray has the largest spread and median value and that the sales drop in February. The sales increase slightly towards December.

tst <- data.frame(cbind(as.matrix(ts_all), as.matrix(cycle(ts_all))))

ggplot(tst, aes(as.factor(tst[,2]), tst[,1])) +
  ylab("Sales") + 
  xlab("Month") + 
  ggtitle("Sales for each month") + 
  stat_boxplot(geom ='errorbar')+ 
  geom_boxplot(outlier.shape = NA) +  
  geom_jitter(aes(colour=tst[,1]), position = position_jitter(width = 0.2)) +
  scale_colour_gradient2(low = "blue", mid = "red",high = "black", midpoint = mean(tst[,1]), name="Sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))


Next I will create a training and testing data set. The training set will use 70% of the observations and the testing set will use the remaining 30%. After this I will calculate the future predictions for each model.

tst <- data.frame(cbind(as.matrix(ts_all), as.matrix(cycle(ts_all))))
splitTrainXvat <- function(tser, perc_train){
  ntrain <- floor(length(as.vector(tser)) * perc_train)
  nval <- length(as.vector(tser)) - ntrain
  
  ttrain <- ts(as.vector(tser[1:ntrain]), start = start(tser), frequency = frequency(tser))
  tval <- ts(as.vector(tser[ntrain + 1:nval]), start = end(ttrain) + deltat(tser), 
            frequency = frequency(tser))
  
  stopifnot(length(ttrain) == ntrain)
  stopifnot(length(tval) == nval)
  
  list(ttrain, tval)
}
data <- splitTrainXvat(ts_all, 0.70)
ts_train <- data[[1]]
ts_val <- data[[2]]


Next I will create multiple models of different types to see which one minimizes the main error metric (RMSE).
The models are:

  1. Autoregressive model with Box-Cox transformation.

  2. Linear regression model with seasonal and trend features with Box-Cox transformation.

  3. Holt-Winters model with additive seasonal components and logarithmic transformation.

  4. Holt-Winters model with multiplicative seasonal components and logarithmic transformation.

  5. Seasonal autoregressive integrated moving average model.

  6. Seasonal autoregressive integrated moving average model with Box-Cox transformation.

  7. Exponential smoothing state space model with error, seasonality and trend features chosen programatically with Box-Cox transformation.

  8. Exponential smoothing state space model with Box-Cox transformation, ARMA errors, Trend and Seasonal components.

#see model parameters
mod.ar <- ar(ts_train, method = "yule-walker", lambda=0)
mod.reg <- tslm(ts_train ~ trend + season, lambda=0)
mod.hw.mul <- HoltWinters(log(ts_train), seasonal = "mul")
mod.hw.add <- HoltWinters(log(ts_train), seasonal = "add")
mod.arima <- auto.arima(ts_train, max.p=2, max.q=2,
                        max.P=2, max.Q=2, max.d=2, max.D=2, allowdrift = TRUE,
                        stepwise=FALSE, approximation=FALSE)
mod.arima.boxcox <- auto.arima(ts_train, max.p=2, max.q=2,
                            max.P=2, max.Q=2, max.d=2, max.D=2, allowdrift = TRUE,
                            stepwise=FALSE, approximation=FALSE, lambda=0)
mod.ets <- ets(ts_train, model="ZZZ", lambda=0)
mod.tbats <- tbats(ts_train, use.box.cox=TRUE)


Here I make the forecasts for the models for the testing set and an additional 20 forecasts to see how the models would predict beyond our data.

pred.ar <- forecast(mod.ar, h = length(ts_val)+20)$mean
pred.reg <- forecast(mod.reg, h = length(ts_val)+20)$mean
pred.hw.mul <- exp(forecast(mod.hw.mul, h = length(ts_val)+20)$mean)
pred.hw.add <- exp(forecast(mod.hw.add, h = length(ts_val)+20)$mean)
pred.arima <- forecast(mod.arima, h = length(ts_val)+20)$mean
pred.arima.boxcox <- forecast(mod.arima.boxcox, h = length(ts_val)+20)$mean
pred.ets <- forecast(mod.ets, h = length(ts_val)+20)$mean
pred.tbats <- forecast(mod.tbats, h = length(ts_val)+20)$mean


Next I will plot the data. The red line labeled “Base” is the time series of interest that the models try to predict.
Here are the AR and Regression models and ETS and TBATS models.

dat <- data.frame(Y=as.matrix(ts_all), date=as.Date(ts_all), model = "Base")
dat <- rbind(dat, 
             data.frame(Y=as.matrix(pred.ar), date=as.Date(pred.ar), model = "AR"),
             data.frame(Y=as.matrix(pred.reg), date=as.Date(pred.reg), model = "Reg"))

plot1 <- ggplot(dat, aes(date, Y, colour = model)) + 
  geom_line()+
  geom_vline(aes(xintercept = as.numeric(dat$date[length(ts_all)])), linetype = "longdash", color = "black")+
  ylab("")+
  xlab("")+
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))


dat <- data.frame(Y=as.matrix(ts_all), date=as.Date(ts_all), model = "Base")
dat <- rbind(dat, 
             data.frame(Y=as.matrix(pred.ets), date=as.Date(pred.ets), model = "ETS"),
             data.frame(Y=as.matrix(pred.tbats), date=as.Date(pred.tbats), model = "TBATS"))

plot2 <- ggplot(dat, aes(date, Y, colour = model)) + 
  geom_line()+
  geom_vline(aes(xintercept = as.numeric(dat$date[length(ts_all)])), linetype = "longdash", color = "black")+
  ylab("")+
  xlab("")+
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))

grid.arrange(plot1, plot2, ncol = 1)


Here are the Holt-Winters models and SARIMA models.

dat <- data.frame(Y=as.matrix(ts_all), date=as.Date(ts_all), model = "Base")
dat <- rbind(dat, 
             data.frame(Y=as.matrix(pred.hw.add), date=as.Date(pred.hw.add), model = "HW.Add"),
             data.frame(Y=as.matrix(pred.hw.mul), date=as.Date(pred.hw.mul), model = "HW.Mul"))

plot1 <- ggplot(dat, aes(date, Y, colour = model)) + 
  geom_line()+
  geom_vline(aes(xintercept = as.numeric(dat$date[length(ts_all)])), linetype = "longdash", color = "black")+
  ylab("")+
  xlab("")+
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))


dat <- data.frame(Y=as.matrix(ts_all), date=as.Date(ts_all), model = "Base")
dat <- rbind(dat, 
             data.frame(Y=as.matrix(pred.arima), date=as.Date(pred.arima), model = "SARIMA"),
             data.frame(Y=as.matrix(pred.arima.boxcox), date=as.Date(pred.arima.boxcox), model = "SARIMA.boxcox"))

plot2 <- ggplot(dat, aes(date, Y, colour = model)) + 
  geom_line()+
  geom_vline(aes(xintercept = as.numeric(dat$date[length(ts_all)])), linetype = "longdash", color = "black")+
  ylab("")+
  xlab("")+
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))

grid.arrange(plot1, plot2, ncol = 1)


As we can see below, the Regression model performs the best in the testing data when the forecasting horizion is 62 months measured by RMSE. However, when looking at the other error metrics, the SARIMA.boxcox model is actually performing better than the Regression model. These error metrics are computed from the testing set, so only for the observations up to the black dashed line in the plots. The observations after this black dashed line are the future forecasts of sales.

The error measures are:

  1. RMSE: Root Mean Squared Error.

  2. MAE: Mean Absolute Error.

  3. MAPE: Mean Absolute Percentage Error.

  4. Theil’s U: Uncertainty coefficient.

tst <- rbind(accuracy(pred.ar, ts_all),
accuracy(pred.hw.mul, ts_all),
accuracy(pred.hw.add, ts_all),
accuracy(pred.reg, ts_all),
accuracy(pred.arima, ts_all),
accuracy(pred.arima.boxcox, ts_all),
accuracy(pred.ets, ts_all),
accuracy(pred.tbats, ts_all))

row.names(tst) <- c("AR",
                    "HW.Mul",
                    "HW.Add",
                    "Reg",
                    "SARIMA",
                    "SARIMA.boxcox",
                    "ETS",
                    "TBATS")
tst <- data.frame(tst)
tst <- tst[,(names(tst) %in% c("RMSE", "MAE", "MAPE", "Theil.s.U"))]

round(tst[order(tst$RMSE),], 5)
##                   RMSE     MAE     MAPE Theil.s.U
## Reg            1.69344 1.31561  7.49949   0.58151
## SARIMA.boxcox  1.74761 1.27464  6.56154   0.55013
## HW.Mul         3.06205 2.35635 12.01514   0.96272
## ETS            3.26590 2.50299 12.53139   1.02069
## HW.Add         3.58628 2.82680 14.24099   1.11379
## SARIMA         3.78418 2.89971 14.27774   1.15756
## TBATS          7.34117 6.21721 31.81599   2.34436
## AR            10.43364 9.16299 47.63211   3.38017


Since it was not conclusive which model was best due to some metrics favoring the Regression model and some favoring the SARIMA.boxcox model I have decided to see if taking the mean forecast of those two models would be better than either of those models.

temp <- data.frame(Y = as.matrix(pred.arima.boxcox)+as.matrix(pred.reg))
temp$mean <- temp$Y/2

dat <- data.frame(Y=as.matrix(ts_all), date=as.Date(ts_all), model = "Base")
dat <- rbind(dat, 
             data.frame(Y=temp$mean, date=as.Date(pred.arima.boxcox), model = "Mean Model"))

ggplot(dat, aes(date, Y, colour = model)) + 
  geom_line()+
  geom_vline(aes(xintercept = as.numeric(dat$date[length(ts_all)])), linetype = "longdash", color = "black")+
  ylab("")+
  xlab("")+
  ggtitle("Monthly anti-diabetic drug sales")+
  theme(axis.line = element_line(), axis.text=element_text(color='black'), axis.title = element_text(colour = 'black'), legend.text=element_text(), legend.title=element_text(), legend.key = element_rect(colour = "black"))


We can see that all error metrics are better for this combination model!

getPerformance <- function(pred, val){
  res <- pred - val
  MAE <- sum(abs(res))/length(val)
  RSS <- sum(res^2)
  MSE <- RSS/length(val)
  RMSE <- sqrt(MSE)
  APE <- abs((val-pred)/val)
  MAPE <- mean(APE)*100
  M6 <- sum(((pred[2:length(pred)]-val[2:(length(val))])/val[1:(length(val)-1)])^2)
  N6 <- sum(((val[2:length(val)]-val[1:(length(val)-1)])/val[1:(length(val)-1)])^2)
  Theil.s.U <- sqrt(M6/N6)
  perf <- data.frame(RMSE, MAE, MAPE, Theil.s.U)
}

perf <- cbind(type = c("Mean Model"), getPerformance(as.vector(temp$mean[1:length(ts_val)]), 
                                                 as.vector(ts_val)))
perf
##         type     RMSE      MAE     MAPE Theil.s.U
## 1 Mean Model 1.348637 1.060761 5.775292 0.4404351