********************************************************************

Problem Description

********************************************************************

We are continuing our analysis of the monthly sales of Australian wines (data in AustralianWines.csv). The data are for six types of wines (red, rose, sweet white, dry white, sparkling, and fortified) for the period 1980-1994. The units are thousands of liters.

You are hired to obtain short-term forecasts (2-months ahead) for each of the six series, and this task will be repeated every month, as new data arrive.

Evaluate the performance of naive forecasts by examining the training performance and the validation performance

  1. Choose one or more wine types
  2. Compute naive forecasts for the training and validation periods
  3. Plot the actual and forecasted values for the training and validation periods
  4. Compute performance measures (separately) for the training and validation periods

Continuing the problem:

  1. Create a histogram of the validation period forecast errors (in an earlier discussion you already generated naive forecasts for the validation period; use the corresponding errors to create the histogram). Examine the shape in terms of error magnitudes.
  2. Let’s set our desired certainty level to 80%. Compute the first and ninth deciles of the validation errors distribution.
  3. Combine the two deciles with the point forecast (your naive forecast) to obtain an 80% prediction interval

********************************************************************

Load Libraries

********************************************************************

# install.packages("data.table")
library(data.table)
library(forecast)
library(knitr)  # for kable table formatting

********************************************************************

Get data

********************************************************************

aussieWines = fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?attredirects=0&d=1")
aussieWines = data.frame(aussieWines[1:180,]) # Fread gets confused at end of file and adds some NA's
colnames(aussieWines)[5] = "Sparkling"  # Capitalize Sparking wine column name
summary(aussieWines)
##     Month             Fortified         Red           Rose          
##  Length:180         Min.   :1154   Min.   : 464   Length:180        
##  Class :character   1st Qu.:2377   1st Qu.:1123   Class :character  
##  Mode  :character   Median :2894   Median :1559   Mode  :character  
##                     Mean   :2999   Mean   :1630                     
##                     3rd Qu.:3527   3rd Qu.:2106                     
##                     Max.   :5618   Max.   :3670                     
##    Sparkling     Sweet.white      Dry.white   
##  Min.   :1170   Min.   : 85.0   Min.   :1954  
##  1st Qu.:1605   1st Qu.:141.5   1st Qu.:2736  
##  Median :1896   Median :223.5   Median :3090  
##  Mean   :2431   Mean   :247.1   Mean   :3240  
##  3rd Qu.:2599   3rd Qu.:319.2   3rd Qu.:3685  
##  Max.   :7242   Max.   :662.0   Max.   :5725

********************************************************************

Create TS objects

********************************************************************

fortified.ts = ts(aussieWines$Fortified, start=c(1980,1), end = c(1994, 12), freq=12)
red.ts = ts(aussieWines$Red, start=c(1980,1), end = c(1994, 12), freq=12)
sparkling.ts = ts(aussieWines$Sparkling, start=c(1980,1), end = c(1994, 12), freq=12)
sweetwine.ts = ts(aussieWines$Sweet.white, start=c(1980,1), end = c(1994, 12), freq=12)
drywhite.ts = ts(aussieWines$Dry.white, start=c(1980,1), end = c(1994, 12), freq=12)

aussieWines$Rose = as.numeric(aussieWines$Rose)  # make Rose a numeric variables and introduce NA's at "*"s
## Warning: NAs introduced by coercion
rose.ts = ts(aussieWines$Rose, start=c(1980,1), end = c(1994, 12), freq=12)

# Impute by interpolation missing values in rose.ts
# install.packages("imputeTS")
library(imputeTS)
rose.ts = na.interpolation(rose.ts)

********************************************************************

Plot Wines: Data, Trend, Seasonality

********************************************************************

#install.packages("forecast")
# library(forecast)

par(mfrow = c(1, 3))
plot(fortified.ts, bty="l", main="Fortified Wine")
abline(reg=lm(fortified.ts~time(fortified.ts)))
plot(aggregate(fortified.ts,FUN=mean), main="Fortified Trend")
boxplot(fortified.ts~cycle(fortified.ts), xlab="Months", main="Seasonality of Fortified Wine")

plot(red.ts, bty="l", main= "Red Wine")
abline(reg=lm(red.ts~time(red.ts)))
plot(aggregate(red.ts,FUN=mean), main="Red Wine Trend")
boxplot(red.ts~cycle(red.ts), xlab="Months", main="Seasonality of Red Wine")

plot(rose.ts, bty="l", main = "Rose Wine")
abline(reg=lm(rose.ts~time(rose.ts)))
plot(aggregate(rose.ts,FUN=mean), main="Rose Wine Trend")
boxplot(rose.ts~cycle(rose.ts), xlab="Months", main="Seasonality of Rose Wine")

plot(sparkling.ts, bty="l", main = "Sparkling Wine")
abline(reg=lm(sparkling.ts~time(sparkling.ts)))
plot(aggregate(sparkling.ts,FUN=mean), main="Sparkling Wine Trend")
boxplot(sparkling.ts~cycle(sparkling.ts), xlab="Months", main="Seasonality of Sparkling Wine")

plot(sweetwine.ts, bty="l", main="Sweet Wine")
abline(reg=lm(sweetwine.ts~time(sweetwine.ts)))
plot(aggregate(sweetwine.ts,FUN=mean), main="Sweet Wine Trend")
boxplot(sweetwine.ts~cycle(sweetwine.ts), xlab="Months", main="Seasonality of Sweet Wine")

plot(drywhite.ts, bty="l", main = "Dry White Wines")
abline(reg=lm(drywhite.ts~time(drywhite.ts)))
plot(aggregate(drywhite.ts,FUN=mean), main="Dry White Wine Trend")
boxplot(drywhite.ts~cycle(drywhite.ts), xlab="Months", main="Seasonality of Dry White Wine")

********************************************************************

Generate Naive Forecasts for each of the six wines

********************************************************************

# library(forecast)

# Fortified Wine Naive FCs
naiveFortified.pred = naive(fortified.ts, h = 2)
naiveFortified.fc = naiveFortified.pred$mean
snaiveFortified.pred = snaive(fortified.ts, h = 2)  # Seasonal Naive
snaiveFortified.fc = snaiveFortified.pred$mean

# Red Wine Naive FCs
naiveRed.pred = naive(red.ts, h = 2)
naiveRed.fc = naiveRed.pred$mean
snaiveRed.pred = snaive(red.ts, h = 2)  # Seasonal Naive
snaiveRed.fc = snaiveRed.pred$mean

# Sparkling Wine Naive FCs
naiveSparkling.pred = naive(sparkling.ts, h = 2)
naiveSparkling.fc = naiveSparkling.pred$mean
snaiveSparkling.pred = snaive(sparkling.ts, h = 2)  # Seasonal Naive
snaiveSparkling.fc = snaiveSparkling.pred$mean  

# Sweet Wine Naive FCs
naiveSweet.pred = naive(sweetwine.ts, h = 2)
naiveSweet.fc = naiveSweet.pred$mean
snaiveSweet.pred = snaive(sweetwine.ts, h = 2)  # Seasonal Naive
snaiveSweet.fc = snaiveSweet.pred$mean 

# Dry White Wine Naive FCs
naiveDryWhite.pred = naive(drywhite.ts, h = 2)
naiveDryWhite.fc = naiveDryWhite.pred$mean
snaiveDryWhite.pred = snaive(drywhite.ts, h = 2)  # Seasonal Naive
snaiveDryWhite.fc = snaiveDryWhite.pred$mean

# Rose Wine Naive FCs       
naiveRose.pred = naive(rose.ts, h = 2)
naiveRose.fc = naiveRose.pred$mean  # 
snaiveRose.pred = snaive(rose.ts, h = 2)  # Seasonal Naive
snaiveRose.fc = snaiveRose.pred$mean

********************************************************************

Table of Naive Forecasts

********************************************************************

wineNaive.db = data.frame(Wines = colnames(aussieWines)[2:7], 
                          NaiveFC_Jan95 = c(naiveFortified.fc[1], naiveRed.fc[1], naiveRose.fc[1], naiveSparkling.fc[1], naiveSweet.fc[1], naiveDryWhite.fc[1]), 
                          NaiveFC_Feb95 = c(naiveFortified.fc[2], naiveRed.fc[2], naiveRose.fc[2], naiveSparkling.fc[2], naiveSweet.fc[2], naiveDryWhite.fc[2]),
                          Seasonal_NaiveFC_Jan95 = c(snaiveFortified.fc[1], snaiveRed.fc[1], snaiveRose.fc[1], snaiveSparkling.fc[1], snaiveSweet.fc[1], snaiveDryWhite.fc[1]), 
                          Seasonal_NaiveFC_Feb95 = c(snaiveFortified.fc[2], snaiveRed.fc[2], snaiveRose.fc[2], snaiveSparkling.fc[2], snaiveSweet.fc[2], snaiveDryWhite.fc[2])
                          )

kable(wineNaive.db, caption = "Naive Forecasts For Wines Sales")
Naive Forecasts For Wines Sales
Wines NaiveFC_Jan95 NaiveFC_Feb95 Seasonal_NaiveFC_Jan95 Seasonal_NaiveFC_Feb95
Fortified 2467 2467 1154 1568
Red 2684 2684 1041 1728
Rose 84 84 30 35
Sparkling 5999 5999 1197 1968
Sweet.white 394 394 150 280
Dry.white 5725 5725 2265 3685

********************************************************************

Plot Naive Forecasts

********************************************************************

par(mfrow = c(1, 2))

# Plot Fortified Wine naive FCs
plot(naiveFortified.pred, main = 'Fortified Wine Naive FC')
plot(snaiveFortified.pred, main = "Fortified Seasonal Naive FC")

# Plot Red Wine Naive FCs
plot(naiveRed.pred, main = "Red Wine Naive FC")
plot(snaiveRed.pred, main = "Seasonal Red Wine Naive FC")

# Plot Sparkling Wine Naive FCs
plot(naiveSparkling.pred, main = "Sparkling Wine Naive FC")
plot(snaiveSparkling.pred, main = "Seasonal Sparkling Wine Naive FC")

# Plot Sweet Wine Naive FCs
plot(naiveSweet.pred, main = "Sweet Wine Naive FC")
plot(snaiveSweet.pred, main = "Seasonal Sweet Wine Naive FC")

# Plot Dry White Wine Naive FCs
plot(naiveDryWhite.pred, main = "Dry White Wine Naive FC")
plot(snaiveDryWhite.pred, main = "Seasonal Dry White Wine Naive FC" )

# Plot Rose Wine Naive FCs
plot(naiveRose.pred, main = "Rose Wine Naive FC")
plot(snaiveRose.pred, main = "Seasonal Rose Wine Naive FC")

********************************************************************

Performance of naive forecasts for wine sales

********************************************************************

Evaluate the performance of naive forecasts by examining the training performance and the validation performance

1. Compute naive forecasts for the training periods and plot the actual and forecasted values for the training periods.
2. Compute performance measures (separately) for the training periods.

Partition into Training and Validatation Sets

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid
        
# Partition into train and validation set
train.rose.ts = window(rose.ts, start=c(1980,13), end=c(1980,fixed.nTrain)) 
valid.rose.ts = window(rose.ts, start=c(1980, fixed.nTrain), end = c(1980, fixed.nTrain+fixed.nValid))

Compute performance measures on the validation set

# For training set performance create lagged ts
train.rose.lag2 = lag(train.rose.ts, 2)  # For naive forecast
train.rose.lag12 = lag(train.rose.ts, 12)  # For snaive forecast

# Calculate residuals
resids.train.rose.lag2 = train.rose.ts - train.rose.lag2
resids.train.rose.lag12 = train.rose.ts - train.rose.lag12

# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Training Set")
Accuracy Stats for Naive Rose Wine Forecast over Training Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set -0.8441558 38.52053 27.90909 -9.061346 32.72826 0.1036125 1.563916
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag12), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Training Set")
Accuracy Stats for Seasonal Naive Rose Wine Forecast over Training Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set -8.236111 22.15664 15.43056 -11.69132 18.86774 0.2949936 0.8900648

Plot the naive Forecasts for the training period

# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2))  # Plot side by side

# Naive and Seasonal Naive plots
plot(resids.train.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the training set for Rose Wines" )
plot(resids.train.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines" )

# Histograms of residuals
hist(resids.train.rose.lag2, main = "Error in Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")
hist(resids.train.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")

3. Compute naive forecasts for the validation period and plot the actual and forecasted values for the validation period.

4. Compute performance measures (separately) for the validation period.

Compute the performance measure for Rose Wine on the validation set

# For training set performance create lagged ts
valid.rose.lag2 = lag(valid.rose.ts, 2)  # For naive forecast
valid.rose.lag12 = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12))  # For snaive forecast

# Calculate residuals
resids.valid.rose.lag2 = valid.rose.ts - valid.rose.lag2
resids.valid.rose.lag12 = as.vector(valid.rose.ts) - as.vector(valid.rose.lag12)
## Warning in as.vector(valid.rose.ts) - as.vector(valid.rose.lag12): longer
## object length is not a multiple of shorter object length
# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(valid.rose.ts, valid.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Validation Set")
Accuracy Stats for Naive Rose Wine Forecast over Validation Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set 3.636364 17.83539 11.81818 1.464656 24.49496 0.0397031 1.586492
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
# kable(accuracy(valid.rose.lag12, valid.rose.ts), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Validation Set")
RMSE.valid.rose.snaive = sqrt(mean(resids.valid.rose.lag12^2))
print(paste("RSME for Seasonal Naive Rose on Validation Set:",round(RMSE.valid.rose.snaive, 2)))
## [1] "RSME for Seasonal Naive Rose on Validation Set: 20.38"

Plot the naive Forecasts for the Validation period

# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2))  # Plot side by side

# Naive and Seasonal Naive plots
plot(resids.valid.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines" )
plot(resids.valid.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines" )

# Histograms of residuals
hist(resids.valid.rose.lag2, main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")
hist(resids.valid.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")

5. Create a histogram of the validation period forecast errors (in an earlier discussion you already generated naive forecasts for the validation period; use the corresponding errors to create the histogram). Examine the shape in terms of error magnitudes.

6. Let’s set our desired certainty level to 80%. Compute the first and ninth deciles of the validation errors distribution.

7. Combine the two deciles with the point forecast (your naive forecast) to obtain an 80% prediction interval.

For Naive Rose Forecast

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid

rose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-1), end = c(1980, fixed.nTrain+fixed.nValid-2))
rose.vec = as.vector(rose.tmp)
rose.fc = ts(rose.vec, start=c(1994, 1), end=c(1994,12), freq=12)

resids.rose = rose.fc - valid.rose.ts
q.resids.rose = quantile(resids.rose, probs = c(0.1, 0.9))
q10.rose = rose.fc + q.resids.rose[1]
q90.rose = rose.fc + q.resids.rose[2]

# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
table = as.data.frame(c(q10.rose[1], q90.rose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(table) = "Decile values"
kable(table, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Naive Forecast")
First and Ninth Interval of the Validation Errors Distribution on the two period Naive Forecast
Decile values
10% Decile 31.4
90% Decile 93.5
hist(resids.rose, main="Histogram of Residuals of Rose Forecast")

par(mfrow = c(1, 1))
plot(rose.fc, main = "Forecast of Rose Wine \nwith 10% and 90% Confidence Bands. \n Actuals in Blue")
lines(q10.rose, col="red")
lines(q90.rose, col="red")
lines(valid.rose.ts, lty=2, col="blue")

For Seasonal Naive Forecast

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid

srose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12))
srose.vec = as.vector(srose.tmp)
srose.fc = ts(srose.vec, start=c(1994, 1), end=c(1994,12), freq=12)

resids.srose = srose.fc - valid.rose.ts
q.resids.srose = quantile(resids.srose, probs = c(0.1, 0.9))
q10.srose = srose.fc + q.resids.srose[1]
q90.srose = srose.fc + q.resids.srose[2]

# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
stable = as.data.frame(c(q10.srose[1], q90.srose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(stable) = "Decile Values"
kable(stable, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Seasonal Naive Forecast")
First and Ninth Interval of the Validation Errors Distribution on the two period Seasonal Naive Forecast
Decile Values
10% Decile 26.40000
90% Decile 49.83333
hist(resids.srose,  main="Histogram of Residuals of Seasonal Rose Forecast")

par(mfrow = c(1, 1))
plot(srose.fc,  main = "Seasonal Forecast of Rose Wine \n with 10% and 90% Confidence Bands \n Actuals in Blue")
lines(q10.srose, col="red")
lines(q90.srose, col="red")
lines(valid.rose.ts, lty=2, col="blue")