********************************************************************

Problem Description

********************************************************************

We are continuing our analysis of the monthly sales of Australian wines (data in AustralianWines.csv). The data are for six types of wines (red, rose, sweet white, dry white, sparkling, and fortified) for the period 1980-1994. The units are thousands of liters.

You are hired to obtain short-term forecasts (2-months ahead) for each of the six series, and this task will be repeated every month, as new data arrive.

Evaluate the performance of naive forecasts by examining the training performance and the validation performance

  1. Choose one or more wine types
  2. Compute naive forecasts for the training and validation periods
  3. Plot the actual and forecasted values for the training and validation periods
  4. Compute performance measures (separately) for the training and validation periods

Week 3. Continuing the problem:

  1. Create a histogram of the validation period forecast errors (in an earlier discussion you already generated naive forecasts for the validation period; use the corresponding errors to create the histogram). Examine the shape in terms of error magnitudes.
  2. Let’s set our desired certainty level to 80%. Compute the first and ninth deciles of the validation errors distribution.
  3. Combine the two deciles with the point forecast (your naive forecast) to obtain an 80% prediction interval

Week 4. Differencing Wine sales

  1. Use differencing to remove trend and/or seasonality from each series so that we can then use a moving average to forecast future sales.
  2. Post about the differencing operations that you applied to each of the six series, explaining why chose those operations. Including charts can help support your arguments.

********************************************************************

Load Libraries

********************************************************************

# install.packages("data.table")
library(data.table)
library(forecast)
library(knitr)  # for kable table formatting

********************************************************************

Get data

********************************************************************

aussieWines = fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?attredirects=0&d=1")
aussieWines = data.frame(aussieWines[1:180,]) # Fread gets confused at end of file and adds some NA's
colnames(aussieWines)[5] = "Sparkling"  # Capitalize Sparking wine column name
summary(aussieWines)
##     Month             Fortified         Red           Rose          
##  Length:180         Min.   :1154   Min.   : 464   Length:180        
##  Class :character   1st Qu.:2377   1st Qu.:1123   Class :character  
##  Mode  :character   Median :2894   Median :1559   Mode  :character  
##                     Mean   :2999   Mean   :1630                     
##                     3rd Qu.:3527   3rd Qu.:2106                     
##                     Max.   :5618   Max.   :3670                     
##    Sparkling     Sweet.white      Dry.white   
##  Min.   :1170   Min.   : 85.0   Min.   :1954  
##  1st Qu.:1605   1st Qu.:141.5   1st Qu.:2736  
##  Median :1896   Median :223.5   Median :3090  
##  Mean   :2431   Mean   :247.1   Mean   :3240  
##  3rd Qu.:2599   3rd Qu.:319.2   3rd Qu.:3685  
##  Max.   :7242   Max.   :662.0   Max.   :5725

********************************************************************

Create TS objects

********************************************************************

fortified.ts = ts(aussieWines$Fortified, start=c(1980,1), end = c(1994, 12), freq=12)
red.ts = ts(aussieWines$Red, start=c(1980,1), end = c(1994, 12), freq=12)
sparkling.ts = ts(aussieWines$Sparkling, start=c(1980,1), end = c(1994, 12), freq=12)
sweetwine.ts = ts(aussieWines$Sweet.white, start=c(1980,1), end = c(1994, 12), freq=12)
drywhite.ts = ts(aussieWines$Dry.white, start=c(1980,1), end = c(1994, 12), freq=12)

aussieWines$Rose = as.numeric(aussieWines$Rose)  # make Rose a numeric variables and introduce NA's at "*"s
## Warning: NAs introduced by coercion
rose.ts = ts(aussieWines$Rose, start=c(1980,1), end = c(1994, 12), freq=12)

# Impute by interpolation missing values in rose.ts
# install.packages("imputeTS")
library(imputeTS)
rose.ts = na.interpolation(rose.ts)

********************************************************************

Generate Naive Forecasts for each of the six wines

********************************************************************

# library(forecast)

# Fortified Wine Naive FCs
naiveFortified.pred = naive(fortified.ts, h = 2)
naiveFortified.fc = naiveFortified.pred$mean
snaiveFortified.pred = snaive(fortified.ts, h = 2)  # Seasonal Naive
snaiveFortified.fc = snaiveFortified.pred$mean

# Red Wine Naive FCs
naiveRed.pred = naive(red.ts, h = 2)
naiveRed.fc = naiveRed.pred$mean
snaiveRed.pred = snaive(red.ts, h = 2)  # Seasonal Naive
snaiveRed.fc = snaiveRed.pred$mean

# Sparkling Wine Naive FCs
naiveSparkling.pred = naive(sparkling.ts, h = 2)
naiveSparkling.fc = naiveSparkling.pred$mean
snaiveSparkling.pred = snaive(sparkling.ts, h = 2)  # Seasonal Naive
snaiveSparkling.fc = snaiveSparkling.pred$mean  

# Sweet Wine Naive FCs
naiveSweet.pred = naive(sweetwine.ts, h = 2)
naiveSweet.fc = naiveSweet.pred$mean
snaiveSweet.pred = snaive(sweetwine.ts, h = 2)  # Seasonal Naive
snaiveSweet.fc = snaiveSweet.pred$mean 

# Dry White Wine Naive FCs
naiveDryWhite.pred = naive(drywhite.ts, h = 2)
naiveDryWhite.fc = naiveDryWhite.pred$mean
snaiveDryWhite.pred = snaive(drywhite.ts, h = 2)  # Seasonal Naive
snaiveDryWhite.fc = snaiveDryWhite.pred$mean

# Rose Wine Naive FCs       
naiveRose.pred = naive(rose.ts, h = 2)
naiveRose.fc = naiveRose.pred$mean  # 
snaiveRose.pred = snaive(rose.ts, h = 2)  # Seasonal Naive
snaiveRose.fc = snaiveRose.pred$mean

********************************************************************

Table of Naive Forecasts

********************************************************************

wineNaive.db = data.frame(Wines = colnames(aussieWines)[2:7], 
                          NaiveFC_Jan95 = c(naiveFortified.fc[1], naiveRed.fc[1], naiveRose.fc[1], naiveSparkling.fc[1], naiveSweet.fc[1], naiveDryWhite.fc[1]), 
                          NaiveFC_Feb95 = c(naiveFortified.fc[2], naiveRed.fc[2], naiveRose.fc[2], naiveSparkling.fc[2], naiveSweet.fc[2], naiveDryWhite.fc[2]),
                          Seasonal_NaiveFC_Jan95 = c(snaiveFortified.fc[1], snaiveRed.fc[1], snaiveRose.fc[1], snaiveSparkling.fc[1], snaiveSweet.fc[1], snaiveDryWhite.fc[1]), 
                          Seasonal_NaiveFC_Feb95 = c(snaiveFortified.fc[2], snaiveRed.fc[2], snaiveRose.fc[2], snaiveSparkling.fc[2], snaiveSweet.fc[2], snaiveDryWhite.fc[2])
                          )

kable(wineNaive.db, caption = "Naive Forecasts For Wines Sales")
Naive Forecasts For Wines Sales
Wines NaiveFC_Jan95 NaiveFC_Feb95 Seasonal_NaiveFC_Jan95 Seasonal_NaiveFC_Feb95
Fortified 2467 2467 1154 1568
Red 2684 2684 1041 1728
Rose 84 84 30 35
Sparkling 5999 5999 1197 1968
Sweet.white 394 394 150 280
Dry.white 5725 5725 2265 3685

********************************************************************

Plot Naive Forecasts

********************************************************************

par(mfrow = c(1, 2))

# Plot Fortified Wine naive FCs
plot(naiveFortified.pred, main = 'Fortified Wine Naive FC')
plot(snaiveFortified.pred, main = "Fortified Seasonal Naive FC")

# Plot Red Wine Naive FCs
plot(naiveRed.pred, main = "Red Wine Naive FC")
plot(snaiveRed.pred, main = "Seasonal Red Wine Naive FC")

# Plot Sparkling Wine Naive FCs
plot(naiveSparkling.pred, main = "Sparkling Wine Naive FC")
plot(snaiveSparkling.pred, main = "Seasonal Sparkling Wine Naive FC")

# Plot Sweet Wine Naive FCs
plot(naiveSweet.pred, main = "Sweet Wine Naive FC")
plot(snaiveSweet.pred, main = "Seasonal Sweet Wine Naive FC")

# Plot Dry White Wine Naive FCs
plot(naiveDryWhite.pred, main = "Dry White Wine Naive FC")
plot(snaiveDryWhite.pred, main = "Seasonal Dry White Wine Naive FC" )

# Plot Rose Wine Naive FCs
plot(naiveRose.pred, main = "Rose Wine Naive FC")
plot(snaiveRose.pred, main = "Seasonal Rose Wine Naive FC")

********************************************************************

Performance of naive forecasts for wine sales

********************************************************************

Evaluate the performance of naive forecasts by examining the training performance and the validation performance

1. Compute naive forecasts for the training periods and plot the actual and forecasted values for the training periods.
2. Compute performance measures (separately) for the training periods.

Partition into Training and Validatation Sets

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid
        
# Partition into train and validation set
train.rose.ts = window(rose.ts, start=c(1980,13), end=c(1980,fixed.nTrain)) 
valid.rose.ts = window(rose.ts, start=c(1980, fixed.nTrain), end = c(1980, fixed.nTrain+fixed.nValid))

Compute performance measures on the validation set

# For training set performance create lagged ts
train.rose.lag2 = lag(train.rose.ts, 2)  # For naive forecast
train.rose.lag12 = lag(train.rose.ts, 12)  # For snaive forecast

# Calculate residuals
resids.train.rose.lag2 = train.rose.ts - train.rose.lag2
resids.train.rose.lag12 = train.rose.ts - train.rose.lag12

# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Training Set")
Accuracy Stats for Naive Rose Wine Forecast over Training Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set -0.8441558 38.52053 27.90909 -9.061346 32.72826 0.1036125 1.563916
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag12), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Training Set")
Accuracy Stats for Seasonal Naive Rose Wine Forecast over Training Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set -8.236111 22.15664 15.43056 -11.69132 18.86774 0.2949936 0.8900648

Plot the naive Forecasts for the training period

# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2))  # Plot side by side

# Naive and Seasonal Naive plots
plot(resids.train.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the training set for Rose Wines" )
plot(resids.train.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines" )

# Histograms of residuals
hist(resids.train.rose.lag2, main = "Error in Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")
hist(resids.train.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")

********************************************************************

3. Compute naive forecasts for the validation period and plot the actual and forecasted values for the validation period.

4. Compute performance measures (separately) for the validation period.

********************************************************************

# For training set performance create lagged ts
valid.rose.lag2 = lag(valid.rose.ts, 2)  # For naive forecast
valid.rose.lag12 = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12))  # For snaive forecast

# Calculate residuals
resids.valid.rose.lag2 = valid.rose.ts - valid.rose.lag2
resids.valid.rose.lag12 = as.vector(valid.rose.ts) - as.vector(valid.rose.lag12)
## Warning in as.vector(valid.rose.ts) - as.vector(valid.rose.lag12): longer
## object length is not a multiple of shorter object length
# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(valid.rose.ts, valid.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Validation Set")
Accuracy Stats for Naive Rose Wine Forecast over Validation Set
ME RMSE MAE MPE MAPE ACF1 Theil’s U
Test set 3.636364 17.83539 11.81818 1.464656 24.49496 0.0397031 1.586492
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
# kable(accuracy(valid.rose.lag12, valid.rose.ts), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Validation Set")
RMSE.valid.rose.snaive = sqrt(mean(resids.valid.rose.lag12^2))
print(paste("RSME for Seasonal Naive Rose on Validation Set:",round(RMSE.valid.rose.snaive, 2)))
## [1] "RSME for Seasonal Naive Rose on Validation Set: 20.38"

Plot the naive Forecasts for the Validation period

# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2))  # Plot side by side

# Naive and Seasonal Naive plots
plot(resids.valid.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines" )
plot(resids.valid.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines" )

# Histograms of residuals
hist(resids.valid.rose.lag2, main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")
hist(resids.valid.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")

********************************************************************

5. Create a histogram of the validation period forecast errors (in an earlier discussion you already generated naive forecasts for the validation period; use the corresponding errors to create the histogram). Examine the shape in terms of error magnitudes.

6. Let’s set our desired certainty level to 80%. Compute the first and ninth deciles of the validation errors distribution.

7. Combine the two deciles with the point forecast (your naive forecast) to obtain an 80% prediction interval.

********************************************************************

For Naive Rose Forecast

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid

rose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-1), end = c(1980, fixed.nTrain+fixed.nValid-2))
rose.vec = as.vector(rose.tmp)
rose.fc = ts(rose.vec, start=c(1994, 1), end=c(1994,12), freq=12)

resids.rose = rose.fc - valid.rose.ts
q.resids.rose = quantile(resids.rose, probs = c(0.1, 0.9))
q10.rose = rose.fc + q.resids.rose[1]
q90.rose = rose.fc + q.resids.rose[2]

# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
table = as.data.frame(c(q10.rose[1], q90.rose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(table) = "Decile values"
kable(table, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Naive Forecast")
First and Ninth Interval of the Validation Errors Distribution on the two period Naive Forecast
Decile values
10% Decile 31.4
90% Decile 93.5
hist(resids.rose, main="Histogram of Residuals of Rose Forecast")

par(mfrow = c(1, 1))
plot(rose.fc, main = "Forecast of Rose Wine \nwith 10% and 90% Confidence Bands. \n Actuals in Blue")
lines(q10.rose, col="red")
lines(q90.rose, col="red")
lines(valid.rose.ts, lty=2, col="blue")

For Seasonal Naive Forecast

fixed.nValid = 12 
fixed.nTrain = length(rose.ts) - fixed.nValid

srose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12))
srose.vec = as.vector(srose.tmp)
srose.fc = ts(srose.vec, start=c(1994, 1), end=c(1994,12), freq=12)

resids.srose = srose.fc - valid.rose.ts
q.resids.srose = quantile(resids.srose, probs = c(0.1, 0.9))
q10.srose = srose.fc + q.resids.srose[1]
q90.srose = srose.fc + q.resids.srose[2]

# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
stable = as.data.frame(c(q10.srose[1], q90.srose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(stable) = "Decile Values"
kable(stable, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Seasonal Naive Forecast")
First and Ninth Interval of the Validation Errors Distribution on the two period Seasonal Naive Forecast
Decile Values
10% Decile 26.40000
90% Decile 49.83333
hist(resids.srose,  main="Histogram of Residuals of Seasonal Rose Forecast")

par(mfrow = c(1, 1))
plot(srose.fc,  main = "Seasonal Forecast of Rose Wine \n with 10% and 90% Confidence Bands \n Actuals in Blue")
lines(q10.srose, col="red")
lines(q90.srose, col="red")
lines(valid.rose.ts, lty=2, col="blue")

********************************************************************

Week 4. Differencing Wine sales

8. Use differencing to remove trend and/or seasonality from each series so that we can then use a moving average to forecast future sales.

9. Post about the differencing operations that you applied to each of the six series, explaining why chose those operations. Including charts can help support your arguments.

********************************************************************

Plot Wines: Data, Trend, Seasonality
#install.packages("forecast")
# library(forecast)

par(mfrow = c(1, 3))
plot(fortified.ts, bty="l", main="Fortified Wine")
abline(reg=lm(fortified.ts~time(fortified.ts)))
plot(aggregate(fortified.ts,FUN=mean), main="Fortified Trend")
boxplot(fortified.ts~cycle(fortified.ts), xlab="Months", main="Seasonality of Fortified Wine")

plot(red.ts, bty="l", main= "Red Wine")
abline(reg=lm(red.ts~time(red.ts)))
plot(aggregate(red.ts,FUN=mean), main="Red Wine Trend")
boxplot(red.ts~cycle(red.ts), xlab="Months", main="Seasonality of Red Wine")

plot(rose.ts, bty="l", main = "Rose Wine")
abline(reg=lm(rose.ts~time(rose.ts)))
plot(aggregate(rose.ts,FUN=mean), main="Rose Wine Trend")
boxplot(rose.ts~cycle(rose.ts), xlab="Months", main="Seasonality of Rose Wine")

plot(sparkling.ts, bty="l", main = "Sparkling Wine")
abline(reg=lm(sparkling.ts~time(sparkling.ts)))
plot(aggregate(sparkling.ts,FUN=mean), main="Sparkling Wine Trend")
boxplot(sparkling.ts~cycle(sparkling.ts), xlab="Months", main="Seasonality of Sparkling Wine")

plot(sweetwine.ts, bty="l", main="Sweet Wine")
abline(reg=lm(sweetwine.ts~time(sweetwine.ts)))
plot(aggregate(sweetwine.ts,FUN=mean), main="Sweet Wine Trend")
boxplot(sweetwine.ts~cycle(sweetwine.ts), xlab="Months", main="Seasonality of Sweet Wine")

plot(drywhite.ts, bty="l", main = "Dry White Wines")
abline(reg=lm(drywhite.ts~time(drywhite.ts)))
plot(aggregate(drywhite.ts,FUN=mean), main="Dry White Wine Trend")
boxplot(drywhite.ts~cycle(drywhite.ts), xlab="Months", main="Seasonality of Dry White Wine")

Clearly, there is both trend and seasonality in each of the wine’s time series. Since each of the wine time series have a trend component and a seasonal component we will have to do double differencing to create a stationary series.

Perform double differencing on each wine series
drywhite.dif1 = diff(drywhite.ts, lag=1)
drywhite.dif2 = diff(drywhite.dif1, lag=12)

par(mfrow = c(1, 2))
plot(drywhite.dif1, main = "Dry White Wine: 1st differencing")
plot(drywhite.dif2, main = "Dry White Wine: 2nd differencing")

Notice that the second differencing plot does not look stationary, but the variance seems to increase over time. I will take logs of the time series before trying to difference to remove this non-stationarity.

Take logs of the time series and then double difference.
# Fortified Wine
fortified.log = log(fortified.ts)
fortified.dif1 = diff(fortified.log, lag=1) 
fortified.dif2 = diff(fortified.dif1, lag=12)  # lag = 12 to remove seasonality

# test undifferencing algo
# fortified.recover = diffinv(diffinv(fortified.dif2, xi=fortified.dif1[1]), xi=fortified.log[1])
# exp(fortified.recover)

par(mfrow = c(1, 2))
plot(fortified.dif1, main = "Fortified Wine: 1st differencing")
plot(fortified.dif2, main = "Fortified Wine: 2nd differencing")

# Red wine
red.log = log(red.ts)
red.dif1 = diff(log(red.log), lag=1)
red.dif2 = diff(red.dif1, lag=12) # lag = 12 to remove seasonality

plot(red.dif1, main = "Red Wine: 1st differencing")
plot(red.dif2, main = "Red Wine: 2nd differencing")

# Sparkling Wine
sparkling.log = log(sparkling.ts)
sparkling.dif1 = diff(sparkling.log, lag=1)
sparkling.dif2 = diff(sparkling.dif1, lag=12) # lag = 12 to remove seasonality

plot(sparkling.dif1, main = "Sparkling Wine: 1st differencing")
plot(sparkling.dif2, main = "Sparkling Wine: 2nd differencing")

# Sweet Wine
sweetwine.log = log(sweetwine.ts)
sweetwine.dif1 = diff(sweetwine.log, lag=1)
sweetwine.dif2 = diff(sweetwine.dif1, lag=12) # lag = 12 to remove seasonality

plot(sweetwine.dif1, main = "Sweet Wine: 1st differencing")
plot(sweetwine.dif2, main = "Sweet Wine: 2nd differencing")

# Dry White
drywhite.log = log(drywhite.ts)
drywhite.dif1 = diff(drywhite.log, lag=1)
drywhite.dif2 = diff(drywhite.dif1, lag=12) # lag = 12 to remove seasonality

plot(drywhite.dif1, main = "Dry White Wine: 1st differencing")
plot(drywhite.dif2, main = "Dry White Wine: 2nd differencing")

# Rose Wine
rose.log = log(rose.ts)
rose.dif1 = diff(rose.log, lag=1)
rose.dif2 = diff(rose.dif1, lag=12) # lag = 12 to remove seasonality

plot(rose.dif1, main = "Rose Wine: 1st differencing")
plot(rose.dif2, main = "Rose Wine: 2nd differencing")