Retail Sales Forecast

This is a Retail sales dataset downloaded from Datamarket.com. I processed the data and performed the forecasting analysis.

library(rdatamarket)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# fetch the data from data market
dminit("404ca002d8964fd682c3eb39a09dace1")
data <- dmlist("http://datamarket.com/data/set/3oei/retailers-sales#!ds=3oei")
# lets explore the data(first 5 rows) 
head(data)

##     Month  Value
## 1 1994-08 182445
## 2 1994-09 175128
## 3 1994-10 178642
## 4 1994-11 184272
## 5 1994-12 221549
## 6 1995-01 158004

names(data)

## [1] "Month" "Value"

tsdata<-data$Value
# convert to time series format
tseriesdata<-ts(tsdata,start=c(2006,10),end=c(2013,10),frequency=12)
# plot time series
plot.ts(tseriesdata)

plot of chunk unnamed-chunk-1

Though it does not look like having a multiplicative trend , yet to check that we will go for log transformation . this will convert it to additive trend if multiplicative trend exists.

r newts<-log(tseriesdata) plot(newts)

plot of chunk unnamed-chunk-2

As expected, it does not have any multiplicative trend. So we will perform seasonal decomposition to handle additive trend, seasonlity , and any irregular component if present.

# Seasonal decompostion
fit <- stl(tseriesdata, s.window="period")
plot(fit)

plot of chunk unnamed-chunk-3

# Lets check the seasonality ,if present
monthplot(tseriesdata)

plot of chunk unnamed-chunk-3

library(forecast)

## Loading required package: timeDate
## This is forecast 5.5

seasonplot(tseriesdata)

plot of chunk unnamed-chunk-3

fit<-ets(tseriesdata)
plot(fit)

plot of chunk unnamed-chunk-3

acf(tseriesdata)

plot of chunk unnamed-chunk-3

pacf(tseriesdata)

plot of chunk unnamed-chunk-3

# simple exponential - models level
fits <- HoltWinters(tseriesdata, beta=FALSE, gamma=FALSE)
accuracy(fits)

##                ME  RMSE   MAE   MPE  MAPE   MASE     ACF1
## Training set 5252 24870 17948 2.163 7.195 0.9418 -0.02458

# double exponential - models level and trend
fitd <- HoltWinters(tseriesdata, gamma=FALSE)
accuracy(fitd)

##                 ME   RMSE   MAE   MPE  MAPE  MASE   ACF1
## Training set 84837 149074 94682 1.831 7.397 4.969 0.9491

# triple exponential - models level, trend, and seasonal components
fite <- HoltWinters(tseriesdata)
accuracy(fite)

##                  ME   RMSE    MAE   MPE  MAPE  MASE   ACF1
## Training set 130882 186904 136227 1.512 5.502 7.149 0.9632

forecast1<-forecast(fite,5)
plot(forecast1)

plot of chunk unnamed-chunk-3

forecast2<-forecast(fitd,5)
plot(forecast2)

plot of chunk unnamed-chunk-3

acf(forecast1$residuals,lag.max=20)

plot of chunk unnamed-chunk-3

acf(forecast2$residuals,lag.max=20)

plot of chunk unnamed-chunk-3

Box.test(forecast1$residuals, lag=20, type="Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  forecast1$residuals
## X-squared = 15.48, df = 20, p-value = 0.7482

Box.test(forecast2$residuals, lag=20, type="Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  forecast2$residuals
## X-squared = 23.24, df = 20, p-value = 0.2771

# needs smoothing
tsforecast<-HoltWinters(log(tseriesdata))
plot(tsforecast)

plot of chunk unnamed-chunk-3

tsforecast

## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = log(tseriesdata))
## 
## Smoothing parameters:
##  alpha: 0.3236
##  beta : 0
##  gamma: 0.7722
## 
## Coefficients:
##          [,1]
## a   12.706868
## b    0.004957
## s1   0.153936
## s2  -0.003740
## s3  -0.016749
## s4   0.089852
## s5   0.046179
## s6   0.137895
## s7  -0.001986
## s8  -0.041266
## s9   0.026064
## s10 -0.035710
## s11 -0.001709
## s12  0.043178

tsforecast$SSE

## [1] 0.5707

ftAsmooth<-forecast.HoltWinters(tsforecast,h=48)
plot.forecast(ftAsmooth)

plot of chunk unnamed-chunk-3

ftAsmooth

##          Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Nov 2013          12.87 12.75 12.98 12.69 13.04
## Dec 2013          12.71 12.59 12.83 12.53 12.90
## Jan 2014          12.70 12.58 12.83 12.51 12.90
## Feb 2014          12.82 12.69 12.95 12.62 13.02
## Mar 2014          12.78 12.64 12.91 12.57 12.98
## Apr 2014          12.87 12.73 13.01 12.66 13.09
## May 2014          12.74 12.60 12.88 12.52 12.96
## Jun 2014          12.71 12.56 12.85 12.48 12.93
## Jul 2014          12.78 12.62 12.93 12.54 13.01
## Aug 2014          12.72 12.56 12.88 12.48 12.96
## Sep 2014          12.76 12.60 12.92 12.51 13.01
## Oct 2014          12.81 12.64 12.98 12.56 13.06
## Nov 2014          12.93 12.73 13.12 12.63 13.22
## Dec 2014          12.77 12.58 12.97 12.47 13.07
## Jan 2015          12.76 12.57 12.96 12.46 13.07
## Feb 2015          12.88 12.67 13.08 12.57 13.19
## Mar 2015          12.84 12.63 13.04 12.52 13.15
## Apr 2015          12.93 12.73 13.14 12.61 13.25
## May 2015          12.80 12.59 13.01 12.48 13.12
## Jun 2015          12.76 12.55 12.98 12.44 13.09
## Jul 2015          12.84 12.62 13.06 12.50 13.17
## Aug 2015          12.78 12.56 13.00 12.44 13.12
## Sep 2015          12.82 12.60 13.04 12.48 13.16
## Oct 2015          12.87 12.64 13.10 12.52 13.22
## Nov 2015          12.98 12.74 13.23 12.61 13.36
## Dec 2015          12.83 12.58 13.08 12.45 13.21
## Jan 2016          12.82 12.57 13.08 12.44 13.21
## Feb 2016          12.94 12.68 13.19 12.55 13.32
## Mar 2016          12.90 12.64 13.15 12.50 13.29
## Apr 2016          12.99 12.73 13.25 12.60 13.39
## May 2016          12.86 12.60 13.12 12.46 13.26
## Jun 2016          12.82 12.56 13.09 12.42 13.23
## Jul 2016          12.90 12.63 13.16 12.49 13.31
## Aug 2016          12.84 12.57 13.11 12.43 13.25
## Sep 2016          12.88 12.61 13.15 12.46 13.30
## Oct 2016          12.93 12.65 13.20 12.51 13.35
## Nov 2016          13.04 12.75 13.34 12.60 13.49
## Dec 2016          12.89 12.60 13.18 12.44 13.34
## Jan 2017          12.88 12.59 13.18 12.43 13.34
## Feb 2017          13.00 12.70 13.29 12.54 13.45
## Mar 2017          12.96 12.66 13.26 12.50 13.42
## Apr 2017          13.05 12.75 13.36 12.59 13.52
## May 2017          12.92 12.61 13.22 12.45 13.38
## Jun 2017          12.88 12.58 13.19 12.41 13.35
## Jul 2017          12.96 12.65 13.27 12.48 13.43
## Aug 2017          12.90 12.59 13.21 12.42 13.38
## Sep 2017          12.94 12.62 13.25 12.46 13.42
## Oct 2017          12.99 12.67 13.30 12.51 13.47

Box.test(ftAsmooth$residuals, lag=20, type="Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  ftAsmooth$residuals
## X-squared = 18.67, df = 20, p-value = 0.5434

plot.ts(ftAsmooth$residuals)

plot of chunk unnamed-chunk-3

plotForecastErrors <- function(forecasterrors)
  {
     # make a histogram of the forecast errors:
     mybinsize <- IQR(forecasterrors)/4
     mysd   <- sd(forecasterrors)
     mymin  <- min(forecasterrors) - mysd*5
     mymax  <- max(forecasterrors) + mysd*3
     # generate normally distributed data with mean 0 and standard deviation mysd
     mynorm <- rnorm(10000, mean=0, sd=mysd)
     mymin2 <- min(mynorm)
     mymax2 <- max(mynorm)
     if (mymin2 < mymin) { mymin <- mymin2 }
     if (mymax2 > mymax) { mymax <- mymax2 }
     # make a red histogram of the forecast errors, with the normally distributed data overlaid:
     mybins <- seq(mymin, mymax, mybinsize)
     hist(forecasterrors, col="red", freq=FALSE, breaks=mybins)
     # freq=FALSE ensures the area under the histogram = 1
     # generate normally distributed data with mean 0 and standard deviation mysd
     myhist <- hist(mynorm, plot=FALSE, breaks=mybins)
     # plot the normal curve as a blue line on top of the histogram of forecast errors:
     points(myhist$mids, myhist$density, type="l", col="blue", lwd=2)
  }
plotForecastErrors(ftAsmooth$residuals)

plot of chunk unnamed-chunk-3

From the time plot, it appears plausible that the forecast errors have constant variance over time. From the histogram of forecast errors, it seems plausible that the forecast errors are normally distributed with mean zero.

Thus,there is little evidence of autocorrelation at lags 1-20 for the forecast errors, and the forecast errors appear to be normally distributed with mean zero and constant variance over time. This suggests that Holt-Winters exponential smoothing provides an adequate predictive model of the log of sales, which probably cannot be improved upon. Furthermore, the assumptions upon which the prediction intervals were based are probably valid.

Retail Sales Forecast

Ashish Kumar

August 22, 2014