# install.packages("data.table")
library(data.table)
aussieWines = fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?attredirects=0&d=1")
## Warning in fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?
## attredirects=0&d=1"): Bumped column 4 to type character on data row 175,
## field contains '*'. Coercing previously read values in this column from
## logical, integer or numeric back to character which may not be lossless;
## e.g., if '00' and '000' occurred before they will now be just '0', and
## there may be inconsistencies with treatment of ',,' and ',NA,' too (if they
## occurred in this column before the bump). If this matters please rerun and
## set 'colClasses' to 'character' for this column. Please note that column
## type detection uses the first 5 rows, the middle 5 rows and the last
## 5 rows, so hopefully this message should be very rare. If reporting to
## datatable-help, please rerun and include the output from verbose=TRUE.
aussieWines = aussieWines[1:180,] # Fread gets confused at end of file and adds some NA's
summary(aussieWines)
## Month Fortified Red Rose
## Length:180 Min. :1154 Min. : 464 Length:180
## Class :character 1st Qu.:2377 1st Qu.:1123 Class :character
## Mode :character Median :2894 Median :1559 Mode :character
## Mean :2999 Mean :1630
## 3rd Qu.:3527 3rd Qu.:2106
## Max. :5618 Max. :3670
## sparkling Sweet white Dry white
## Min. :1170 Min. : 85.0 Min. :1954
## 1st Qu.:1605 1st Qu.:141.5 1st Qu.:2736
## Median :1896 Median :223.5 Median :3090
## Mean :2431 Mean :247.1 Mean :3240
## 3rd Qu.:2599 3rd Qu.:319.2 3rd Qu.:3685
## Max. :7242 Max. :662.0 Max. :5725
fortified.ts = ts(aussieWines$Fortified, start=c(1980,1), end = c(1994, 12), freq=12)
red.ts = ts(aussieWines$Red, start=c(1980,1), end = c(1994, 12), freq=12)
aussieWines$Rose = as.numeric(aussieWines$Rose) # make Rose a numeric variables and introduce NA's at "*"s
## Warning: NAs introduced by coercion
rose.ts = ts(aussieWines$Rose, start=c(1980,1), end = c(1994, 12), freq=12)
sparkling.ts = ts(aussieWines$sparkling, start=c(1980,1), end = c(1994, 12), freq=12)
sweetwine.ts = ts(aussieWines$`Sweet white`, start=c(1980,1), end = c(1994, 12), freq=12)
drywhite.ts = ts(aussieWines$`Dry white`, start=c(1980,1), end = c(1994, 12), freq=12)
# Impute by interpolation missing values in rose.ts
# install.packages("imputeTS")
library(imputeTS)
rose.ts = na.interpolation(rose.ts)
#install.packages("forecast")
library(forecast)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.1
par(mfrow = c(1, 3))
plot(fortified.ts, bty="l", main="Fortified Wine")
abline(reg=lm(fortified.ts~time(fortified.ts)))
plot(aggregate(fortified.ts,FUN=mean), main="Fortified Trend")
boxplot(fortified.ts~cycle(fortified.ts), xlab="Months", main="Seasonality of Fortified Wine")
plot(red.ts, bty="l", main= "Red Wine")
abline(reg=lm(red.ts~time(red.ts)))
plot(aggregate(red.ts,FUN=mean), main="Red Wine Trend")
boxplot(red.ts~cycle(red.ts), xlab="Months", main="Seasonality of Red Wine")
plot(rose.ts, bty="l", main = "Rose Wine")
abline(reg=lm(rose.ts~time(rose.ts)))
plot(aggregate(rose.ts,FUN=mean), main="Rose Wine Trend")
boxplot(rose.ts~cycle(rose.ts), xlab="Months", main="Seasonality of Rose Wine")
plot(sparkling.ts, bty="l", main = "Sparkling Wine")
abline(reg=lm(sparkling.ts~time(sparkling.ts)))
plot(aggregate(sparkling.ts,FUN=mean), main="Sparkling Wine Trend")
boxplot(sparkling.ts~cycle(sparkling.ts), xlab="Months", main="Seasonality of Sparkling Wine")
plot(sweetwine.ts, bty="l", main="Sweet Wine")
abline(reg=lm(sweetwine.ts~time(sweetwine.ts)))
plot(aggregate(sweetwine.ts,FUN=mean), main="Sweet Wine Trend")
boxplot(sweetwine.ts~cycle(sweetwine.ts), xlab="Months", main="Seasonality of Sweet Wine")
plot(drywhite.ts, bty="l", main = "Dry White Wines")
abline(reg=lm(drywhite.ts~time(drywhite.ts)))
plot(aggregate(drywhite.ts,FUN=mean), main="Dry White Wine Trend")
boxplot(drywhite.ts~cycle(drywhite.ts), xlab="Months", main="Seasonality of Dry White Wine")
library(forecast)
for(i in 1:3) {
fixed.nValid = 12 * i # Validation length
fixed.nTrain = length(rose.ts) - fixed.nValid
train.ts = window(rose.ts, start=c(1980,1), end=c(1980,fixed.nTrain))
valid.ts = window(rose.ts, start=c(1980, fixed.nTrain), end = c(1980, fixed.nTrain+fixed.nValid))
naive.pred = naive(train.ts, h = fixed.nValid)
snaive.pred = snaive(train.ts, h = fixed.nValid)
print(paste("Validation length:", fixed.nValid, "months."))
print(paste("Rose Naive Validation RMSE:", accuracy(naive.pred, valid.ts)[2,2]))
print(paste("Rose Seasonal Naive Validation RMSE:", accuracy(snaive.pred, valid.ts)[2,2]))
# print(accuracy(naive.pred, valid.ts))
# print(accuracy(snaive.pred, valid.ts))
}
## [1] "Validation length: 12 months."
## [1] "Rose Naive Validation RMSE: 31.6472162487682"
## [1] "Rose Seasonal Naive Validation RMSE: 7.3516186174404"
## [1] "Validation length: 24 months."
## [1] "Rose Naive Validation RMSE: 43.8028516744608"
## [1] "Rose Seasonal Naive Validation RMSE: 8.90822549565745"
## [1] "Validation length: 36 months."
## [1] "Rose Naive Validation RMSE: 56.4588236277741"
## [1] "Rose Seasonal Naive Validation RMSE: 20.5441409748634"
library(forecast)
fixed.nValid = 12
fixed.nTrain = length(rose.ts) - fixed.nValid
stepsAhead = 1
error = 0
percent.error = 0
for(j in fixed.nTrain:(fixed.nTrain + fixed.nValid - stepsAhead)) {
train.ts = window(rose.ts, start=c(1980,1), end=c(1980, j))
valid.ts = window(rose.ts, start=c(1980, j+stepsAhead), end = c(1980, j+stepsAhead))
# Predict naive forecast
naive.pred = naive(train.ts, h = stepsAhead)
#Calculate error vector
error[j - fixed.nTrain + 1] = valid.ts - naive.pred$mean[stepsAhead]
percent.error[j - fixed.nTrain + 1] = error[j - fixed.nTrain + 1] / valid.ts
# print(train.ts)
# print(valid.ts)
}
# mean(abs(error))
print(paste("Rolling Forward Rose RMSE:", (sqrt(mean(error^2)))))
## [1] "Rolling Forward Rose RMSE: 15.6693260154283"
# mean(abs(percent.error))
The best validation partition length for Rose Wine I tested was 12 months for a seasonal naive fixed forecast. Obviously this is just a proof of concept since I tried it on only one wine and I didn’t iterate all the possibilities for either the Fixed or Roll forward partitioning algorithms. These are the algorithms from the Shmueli Book “Practical Time Series Forecasting with R p. 64 & p. 66”.