********************************************************************

Get data

********************************************************************

# install.packages("data.table")
library(data.table)
aussieWines = fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?attredirects=0&d=1")
## Warning in fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?
## attredirects=0&d=1"): Bumped column 4 to type character on data row 175,
## field contains '*'. Coercing previously read values in this column from
## logical, integer or numeric back to character which may not be lossless;
## e.g., if '00' and '000' occurred before they will now be just '0', and
## there may be inconsistencies with treatment of ',,' and ',NA,' too (if they
## occurred in this column before the bump). If this matters please rerun and
## set 'colClasses' to 'character' for this column. Please note that column
## type detection uses the first 5 rows, the middle 5 rows and the last
## 5 rows, so hopefully this message should be very rare. If reporting to
## datatable-help, please rerun and include the output from verbose=TRUE.
aussieWines = aussieWines[1:180,]  # Fread gets confused at end of file and adds some NA's
summary(aussieWines)
##     Month             Fortified         Red           Rose          
##  Length:180         Min.   :1154   Min.   : 464   Length:180        
##  Class :character   1st Qu.:2377   1st Qu.:1123   Class :character  
##  Mode  :character   Median :2894   Median :1559   Mode  :character  
##                     Mean   :2999   Mean   :1630                     
##                     3rd Qu.:3527   3rd Qu.:2106                     
##                     Max.   :5618   Max.   :3670                     
##    sparkling     Sweet white      Dry white   
##  Min.   :1170   Min.   : 85.0   Min.   :1954  
##  1st Qu.:1605   1st Qu.:141.5   1st Qu.:2736  
##  Median :1896   Median :223.5   Median :3090  
##  Mean   :2431   Mean   :247.1   Mean   :3240  
##  3rd Qu.:2599   3rd Qu.:319.2   3rd Qu.:3685  
##  Max.   :7242   Max.   :662.0   Max.   :5725

********************************************************************

Create TS objects

********************************************************************

fortified.ts = ts(aussieWines$Fortified, start=c(1980,1), end = c(1994, 12), freq=12)
red.ts = ts(aussieWines$Red, start=c(1980,1), end = c(1994, 12), freq=12)

aussieWines$Rose = as.numeric(aussieWines$Rose)  # make Rose a numeric variables and introduce NA's at "*"s
## Warning: NAs introduced by coercion
rose.ts = ts(aussieWines$Rose, start=c(1980,1), end = c(1994, 12), freq=12)

sparkling.ts = ts(aussieWines$sparkling, start=c(1980,1), end = c(1994, 12), freq=12)
sweetwine.ts = ts(aussieWines$`Sweet white`, start=c(1980,1), end = c(1994, 12), freq=12)
drywhite.ts = ts(aussieWines$`Dry white`, start=c(1980,1), end = c(1994, 12), freq=12)

# Impute by interpolation missing values in rose.ts
# install.packages("imputeTS")
library(imputeTS)
rose.ts = na.interpolation(rose.ts)

********************************************************************

Plot Wines: Data, Trend, Seasonality

********************************************************************

#install.packages("forecast")
library(forecast)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.1
par(mfrow = c(1, 3))
plot(fortified.ts, bty="l", main="Fortified Wine")
abline(reg=lm(fortified.ts~time(fortified.ts)))
plot(aggregate(fortified.ts,FUN=mean), main="Fortified Trend")
boxplot(fortified.ts~cycle(fortified.ts), xlab="Months", main="Seasonality of Fortified Wine")

plot(red.ts, bty="l", main= "Red Wine")
abline(reg=lm(red.ts~time(red.ts)))
plot(aggregate(red.ts,FUN=mean), main="Red Wine Trend")
boxplot(red.ts~cycle(red.ts), xlab="Months", main="Seasonality of Red Wine")

plot(rose.ts, bty="l", main = "Rose Wine")
abline(reg=lm(rose.ts~time(rose.ts)))
plot(aggregate(rose.ts,FUN=mean), main="Rose Wine Trend")
boxplot(rose.ts~cycle(rose.ts), xlab="Months", main="Seasonality of Rose Wine")

plot(sparkling.ts, bty="l", main = "Sparkling Wine")
abline(reg=lm(sparkling.ts~time(sparkling.ts)))
plot(aggregate(sparkling.ts,FUN=mean), main="Sparkling Wine Trend")
boxplot(sparkling.ts~cycle(sparkling.ts), xlab="Months", main="Seasonality of Sparkling Wine")

plot(sweetwine.ts, bty="l", main="Sweet Wine")
abline(reg=lm(sweetwine.ts~time(sweetwine.ts)))
plot(aggregate(sweetwine.ts,FUN=mean), main="Sweet Wine Trend")
boxplot(sweetwine.ts~cycle(sweetwine.ts), xlab="Months", main="Seasonality of Sweet Wine")

plot(drywhite.ts, bty="l", main = "Dry White Wines")
abline(reg=lm(drywhite.ts~time(drywhite.ts)))
plot(aggregate(drywhite.ts,FUN=mean), main="Dry White Wine Trend")
boxplot(drywhite.ts~cycle(drywhite.ts), xlab="Months", main="Seasonality of Dry White Wine")

********************************************************************

Fixed Partitioning

********************************************************************

library(forecast)

for(i in 1:3) {
        fixed.nValid = 12 * i # Validation length
        fixed.nTrain = length(rose.ts) - fixed.nValid
        train.ts = window(rose.ts, start=c(1980,1), end=c(1980,fixed.nTrain)) 
        valid.ts = window(rose.ts, start=c(1980, fixed.nTrain), end = c(1980, fixed.nTrain+fixed.nValid))

        naive.pred = naive(train.ts, h = fixed.nValid)
        snaive.pred = snaive(train.ts, h = fixed.nValid)
        
        print(paste("Validation length:", fixed.nValid, "months."))
        print(paste("Rose Naive Validation RMSE:", accuracy(naive.pred, valid.ts)[2,2]))
        print(paste("Rose Seasonal Naive Validation RMSE:", accuracy(snaive.pred, valid.ts)[2,2]))
        
        # print(accuracy(naive.pred, valid.ts))
        # print(accuracy(snaive.pred, valid.ts))
}
## [1] "Validation length: 12 months."
## [1] "Rose Naive Validation RMSE: 31.6472162487682"
## [1] "Rose Seasonal Naive Validation RMSE: 7.3516186174404"
## [1] "Validation length: 24 months."
## [1] "Rose Naive Validation RMSE: 43.8028516744608"
## [1] "Rose Seasonal Naive Validation RMSE: 8.90822549565745"
## [1] "Validation length: 36 months."
## [1] "Rose Naive Validation RMSE: 56.4588236277741"
## [1] "Rose Seasonal Naive Validation RMSE: 20.5441409748634"

********************************************************************

Roll Forward Partitioning

********************************************************************

library(forecast)

fixed.nValid = 12
fixed.nTrain = length(rose.ts) - fixed.nValid
stepsAhead = 1

error = 0
percent.error = 0

for(j in fixed.nTrain:(fixed.nTrain + fixed.nValid - stepsAhead))  {
         train.ts = window(rose.ts, start=c(1980,1), end=c(1980, j)) 
         valid.ts = window(rose.ts, start=c(1980, j+stepsAhead), end = c(1980, j+stepsAhead)) 
        
         # Predict naive forecast
         naive.pred = naive(train.ts, h = stepsAhead)
         
         #Calculate error vector
         error[j - fixed.nTrain + 1] = valid.ts - naive.pred$mean[stepsAhead]
         percent.error[j - fixed.nTrain + 1] = error[j - fixed.nTrain + 1] / valid.ts
         #  print(train.ts)
         # print(valid.ts)
}

        # mean(abs(error))
        print(paste("Rolling Forward Rose RMSE:", (sqrt(mean(error^2)))))
## [1] "Rolling Forward Rose RMSE: 15.6693260154283"
        # mean(abs(percent.error))

********************************************************************

Conclusion

********************************************************************

The best validation partition length for Rose Wine I tested was 12 months for a seasonal naive fixed forecast. Obviously this is just a proof of concept since I tried it on only one wine and I didn’t iterate all the possibilities for either the Fixed or Roll forward partitioning algorithms. These are the algorithms from the Shmueli Book “Practical Time Series Forecasting with R p. 64 & p. 66”.