We are continuing our analysis of the monthly sales of Australian wines (data in AustralianWines.csv). The data are for six types of wines (red, rose, sweet white, dry white, sparkling, and fortified) for the period 1980-1994. The units are thousands of liters.
You are hired to obtain short-term forecasts (2-months ahead) for each of the six series, and this task will be repeated every month, as new data arrive.
Evaluate the performance of naive forecasts by examining the training performance and the validation performance
Continuing the problem:
# install.packages("data.table")
library(data.table)
library(forecast)
library(knitr) # for kable table formatting
aussieWines = fread("http://www.forecastingbook.com/mooc/AustralianWines.csv?attredirects=0&d=1")
aussieWines = data.frame(aussieWines[1:180,]) # Fread gets confused at end of file and adds some NA's
colnames(aussieWines)[5] = "Sparkling" # Capitalize Sparking wine column name
summary(aussieWines)
## Month Fortified Red Rose
## Length:180 Min. :1154 Min. : 464 Length:180
## Class :character 1st Qu.:2377 1st Qu.:1123 Class :character
## Mode :character Median :2894 Median :1559 Mode :character
## Mean :2999 Mean :1630
## 3rd Qu.:3527 3rd Qu.:2106
## Max. :5618 Max. :3670
## Sparkling Sweet.white Dry.white
## Min. :1170 Min. : 85.0 Min. :1954
## 1st Qu.:1605 1st Qu.:141.5 1st Qu.:2736
## Median :1896 Median :223.5 Median :3090
## Mean :2431 Mean :247.1 Mean :3240
## 3rd Qu.:2599 3rd Qu.:319.2 3rd Qu.:3685
## Max. :7242 Max. :662.0 Max. :5725
fortified.ts = ts(aussieWines$Fortified, start=c(1980,1), end = c(1994, 12), freq=12)
red.ts = ts(aussieWines$Red, start=c(1980,1), end = c(1994, 12), freq=12)
sparkling.ts = ts(aussieWines$Sparkling, start=c(1980,1), end = c(1994, 12), freq=12)
sweetwine.ts = ts(aussieWines$Sweet.white, start=c(1980,1), end = c(1994, 12), freq=12)
drywhite.ts = ts(aussieWines$Dry.white, start=c(1980,1), end = c(1994, 12), freq=12)
aussieWines$Rose = as.numeric(aussieWines$Rose) # make Rose a numeric variables and introduce NA's at "*"s
## Warning: NAs introduced by coercion
rose.ts = ts(aussieWines$Rose, start=c(1980,1), end = c(1994, 12), freq=12)
# Impute by interpolation missing values in rose.ts
# install.packages("imputeTS")
library(imputeTS)
rose.ts = na.interpolation(rose.ts)
#install.packages("forecast")
# library(forecast)
par(mfrow = c(1, 3))
plot(fortified.ts, bty="l", main="Fortified Wine")
abline(reg=lm(fortified.ts~time(fortified.ts)))
plot(aggregate(fortified.ts,FUN=mean), main="Fortified Trend")
boxplot(fortified.ts~cycle(fortified.ts), xlab="Months", main="Seasonality of Fortified Wine")
plot(red.ts, bty="l", main= "Red Wine")
abline(reg=lm(red.ts~time(red.ts)))
plot(aggregate(red.ts,FUN=mean), main="Red Wine Trend")
boxplot(red.ts~cycle(red.ts), xlab="Months", main="Seasonality of Red Wine")
plot(rose.ts, bty="l", main = "Rose Wine")
abline(reg=lm(rose.ts~time(rose.ts)))
plot(aggregate(rose.ts,FUN=mean), main="Rose Wine Trend")
boxplot(rose.ts~cycle(rose.ts), xlab="Months", main="Seasonality of Rose Wine")
plot(sparkling.ts, bty="l", main = "Sparkling Wine")
abline(reg=lm(sparkling.ts~time(sparkling.ts)))
plot(aggregate(sparkling.ts,FUN=mean), main="Sparkling Wine Trend")
boxplot(sparkling.ts~cycle(sparkling.ts), xlab="Months", main="Seasonality of Sparkling Wine")
plot(sweetwine.ts, bty="l", main="Sweet Wine")
abline(reg=lm(sweetwine.ts~time(sweetwine.ts)))
plot(aggregate(sweetwine.ts,FUN=mean), main="Sweet Wine Trend")
boxplot(sweetwine.ts~cycle(sweetwine.ts), xlab="Months", main="Seasonality of Sweet Wine")
plot(drywhite.ts, bty="l", main = "Dry White Wines")
abline(reg=lm(drywhite.ts~time(drywhite.ts)))
plot(aggregate(drywhite.ts,FUN=mean), main="Dry White Wine Trend")
boxplot(drywhite.ts~cycle(drywhite.ts), xlab="Months", main="Seasonality of Dry White Wine")
# library(forecast)
# Fortified Wine Naive FCs
naiveFortified.pred = naive(fortified.ts, h = 2)
naiveFortified.fc = naiveFortified.pred$mean
snaiveFortified.pred = snaive(fortified.ts, h = 2) # Seasonal Naive
snaiveFortified.fc = snaiveFortified.pred$mean
# Red Wine Naive FCs
naiveRed.pred = naive(red.ts, h = 2)
naiveRed.fc = naiveRed.pred$mean
snaiveRed.pred = snaive(red.ts, h = 2) # Seasonal Naive
snaiveRed.fc = snaiveRed.pred$mean
# Sparkling Wine Naive FCs
naiveSparkling.pred = naive(sparkling.ts, h = 2)
naiveSparkling.fc = naiveSparkling.pred$mean
snaiveSparkling.pred = snaive(sparkling.ts, h = 2) # Seasonal Naive
snaiveSparkling.fc = snaiveSparkling.pred$mean
# Sweet Wine Naive FCs
naiveSweet.pred = naive(sweetwine.ts, h = 2)
naiveSweet.fc = naiveSweet.pred$mean
snaiveSweet.pred = snaive(sweetwine.ts, h = 2) # Seasonal Naive
snaiveSweet.fc = snaiveSweet.pred$mean
# Dry White Wine Naive FCs
naiveDryWhite.pred = naive(drywhite.ts, h = 2)
naiveDryWhite.fc = naiveDryWhite.pred$mean
snaiveDryWhite.pred = snaive(drywhite.ts, h = 2) # Seasonal Naive
snaiveDryWhite.fc = snaiveDryWhite.pred$mean
# Rose Wine Naive FCs
naiveRose.pred = naive(rose.ts, h = 2)
naiveRose.fc = naiveRose.pred$mean #
snaiveRose.pred = snaive(rose.ts, h = 2) # Seasonal Naive
snaiveRose.fc = snaiveRose.pred$mean
wineNaive.db = data.frame(Wines = colnames(aussieWines)[2:7],
NaiveFC_Jan95 = c(naiveFortified.fc[1], naiveRed.fc[1], naiveRose.fc[1], naiveSparkling.fc[1], naiveSweet.fc[1], naiveDryWhite.fc[1]),
NaiveFC_Feb95 = c(naiveFortified.fc[2], naiveRed.fc[2], naiveRose.fc[2], naiveSparkling.fc[2], naiveSweet.fc[2], naiveDryWhite.fc[2]),
Seasonal_NaiveFC_Jan95 = c(snaiveFortified.fc[1], snaiveRed.fc[1], snaiveRose.fc[1], snaiveSparkling.fc[1], snaiveSweet.fc[1], snaiveDryWhite.fc[1]),
Seasonal_NaiveFC_Feb95 = c(snaiveFortified.fc[2], snaiveRed.fc[2], snaiveRose.fc[2], snaiveSparkling.fc[2], snaiveSweet.fc[2], snaiveDryWhite.fc[2])
)
kable(wineNaive.db, caption = "Naive Forecasts For Wines Sales")
Wines | NaiveFC_Jan95 | NaiveFC_Feb95 | Seasonal_NaiveFC_Jan95 | Seasonal_NaiveFC_Feb95 |
---|---|---|---|---|
Fortified | 2467 | 2467 | 1154 | 1568 |
Red | 2684 | 2684 | 1041 | 1728 |
Rose | 84 | 84 | 30 | 35 |
Sparkling | 5999 | 5999 | 1197 | 1968 |
Sweet.white | 394 | 394 | 150 | 280 |
Dry.white | 5725 | 5725 | 2265 | 3685 |
par(mfrow = c(1, 2))
# Plot Fortified Wine naive FCs
plot(naiveFortified.pred, main = 'Fortified Wine Naive FC')
plot(snaiveFortified.pred, main = "Fortified Seasonal Naive FC")
# Plot Red Wine Naive FCs
plot(naiveRed.pred, main = "Red Wine Naive FC")
plot(snaiveRed.pred, main = "Seasonal Red Wine Naive FC")
# Plot Sparkling Wine Naive FCs
plot(naiveSparkling.pred, main = "Sparkling Wine Naive FC")
plot(snaiveSparkling.pred, main = "Seasonal Sparkling Wine Naive FC")
# Plot Sweet Wine Naive FCs
plot(naiveSweet.pred, main = "Sweet Wine Naive FC")
plot(snaiveSweet.pred, main = "Seasonal Sweet Wine Naive FC")
# Plot Dry White Wine Naive FCs
plot(naiveDryWhite.pred, main = "Dry White Wine Naive FC")
plot(snaiveDryWhite.pred, main = "Seasonal Dry White Wine Naive FC" )
# Plot Rose Wine Naive FCs
plot(naiveRose.pred, main = "Rose Wine Naive FC")
plot(snaiveRose.pred, main = "Seasonal Rose Wine Naive FC")
Partition into Training and Validatation Sets
fixed.nValid = 12
fixed.nTrain = length(rose.ts) - fixed.nValid
# Partition into train and validation set
train.rose.ts = window(rose.ts, start=c(1980,13), end=c(1980,fixed.nTrain))
valid.rose.ts = window(rose.ts, start=c(1980, fixed.nTrain), end = c(1980, fixed.nTrain+fixed.nValid))
Compute performance measures on the validation set
# For training set performance create lagged ts
train.rose.lag2 = lag(train.rose.ts, 2) # For naive forecast
train.rose.lag12 = lag(train.rose.ts, 12) # For snaive forecast
# Calculate residuals
resids.train.rose.lag2 = train.rose.ts - train.rose.lag2
resids.train.rose.lag12 = train.rose.ts - train.rose.lag12
# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Training Set")
ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U | |
---|---|---|---|---|---|---|---|
Test set | -0.8441558 | 38.52053 | 27.90909 | -9.061346 | 32.72826 | 0.1036125 | 1.563916 |
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
kable(accuracy(train.rose.ts, train.rose.lag12), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Training Set")
ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U | |
---|---|---|---|---|---|---|---|
Test set | -8.236111 | 22.15664 | 15.43056 | -11.69132 | 18.86774 | 0.2949936 | 0.8900648 |
Plot the naive Forecasts for the training period
# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2)) # Plot side by side
# Naive and Seasonal Naive plots
plot(resids.train.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the training set for Rose Wines" )
plot(resids.train.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines" )
# Histograms of residuals
hist(resids.train.rose.lag2, main = "Error in Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")
hist(resids.train.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the training set for Rose Wines", xlab="Residuals")
Compute the performance measure for Rose Wine on the validation set
# For training set performance create lagged ts
valid.rose.lag2 = lag(valid.rose.ts, 2) # For naive forecast
valid.rose.lag12 = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12)) # For snaive forecast
# Calculate residuals
resids.valid.rose.lag2 = valid.rose.ts - valid.rose.lag2
resids.valid.rose.lag12 = as.vector(valid.rose.ts) - as.vector(valid.rose.lag12)
## Warning in as.vector(valid.rose.ts) - as.vector(valid.rose.lag12): longer
## object length is not a multiple of shorter object length
# Table for training set accuracy of Naive Rose Wine forecasts
kable(accuracy(valid.rose.ts, valid.rose.lag2), caption = "Accuracy Stats for Naive Rose Wine Forecast over Validation Set")
ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U | |
---|---|---|---|---|---|---|---|
Test set | 3.636364 | 17.83539 | 11.81818 | 1.464656 | 24.49496 | 0.0397031 | 1.586492 |
# Table for training set accuracy of Seasonal Naive Rose Wine forecasts
# kable(accuracy(valid.rose.lag12, valid.rose.ts), caption = "Accuracy Stats for Seasonal Naive Rose Wine Forecast over Validation Set")
RMSE.valid.rose.snaive = sqrt(mean(resids.valid.rose.lag12^2))
print(paste("RSME for Seasonal Naive Rose on Validation Set:",round(RMSE.valid.rose.snaive, 2)))
## [1] "RSME for Seasonal Naive Rose on Validation Set: 20.38"
Plot the naive Forecasts for the Validation period
# Plot the residuals for the training set naive forecasts
par(mfrow = c(1, 2)) # Plot side by side
# Naive and Seasonal Naive plots
plot(resids.valid.rose.lag2, xlab = "Months", ylab="Residuals", main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines" )
plot(resids.valid.rose.lag12, xlab = "Months", ylab="Residuals", main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines" )
# Histograms of residuals
hist(resids.valid.rose.lag2, main = "Error in Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")
hist(resids.valid.rose.lag12, main = "Error in Seasonal Naive Forecast \n on the Validation set \n for Rose Wines", xlab="Residuals")
For Naive Rose Forecast
fixed.nValid = 12
fixed.nTrain = length(rose.ts) - fixed.nValid
rose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-1), end = c(1980, fixed.nTrain+fixed.nValid-2))
rose.vec = as.vector(rose.tmp)
rose.fc = ts(rose.vec, start=c(1994, 1), end=c(1994,12), freq=12)
resids.rose = rose.fc - valid.rose.ts
q.resids.rose = quantile(resids.rose, probs = c(0.1, 0.9))
q10.rose = rose.fc + q.resids.rose[1]
q90.rose = rose.fc + q.resids.rose[2]
# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
table = as.data.frame(c(q10.rose[1], q90.rose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(table) = "Decile values"
kable(table, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Naive Forecast")
Decile values | |
---|---|
10% Decile | 31.4 |
90% Decile | 93.5 |
hist(resids.rose, main="Histogram of Residuals of Rose Forecast")
par(mfrow = c(1, 1))
plot(rose.fc, main = "Forecast of Rose Wine \nwith 10% and 90% Confidence Bands. \n Actuals in Blue")
lines(q10.rose, col="red")
lines(q90.rose, col="red")
lines(valid.rose.ts, lty=2, col="blue")
For Seasonal Naive Forecast
fixed.nValid = 12
fixed.nTrain = length(rose.ts) - fixed.nValid
srose.tmp = window(rose.ts, start=c(1980, fixed.nTrain-11), end = c(1980, fixed.nTrain+fixed.nValid-12))
srose.vec = as.vector(srose.tmp)
srose.fc = ts(srose.vec, start=c(1994, 1), end=c(1994,12), freq=12)
resids.srose = srose.fc - valid.rose.ts
q.resids.srose = quantile(resids.srose, probs = c(0.1, 0.9))
q10.srose = srose.fc + q.resids.srose[1]
q90.srose = srose.fc + q.resids.srose[2]
# Compute the first and ninth deciles of the validation errors distribution for the 2 period naive forecast
stable = as.data.frame(c(q10.srose[1], q90.srose[2]), row.names=c("10% Decile", "90% Decile"))
colnames(stable) = "Decile Values"
kable(stable, caption = "First and Ninth Interval of the Validation Errors Distribution on the two period Seasonal Naive Forecast")
Decile Values | |
---|---|
10% Decile | 26.40000 |
90% Decile | 49.83333 |
hist(resids.srose, main="Histogram of Residuals of Seasonal Rose Forecast")
par(mfrow = c(1, 1))
plot(srose.fc, main = "Seasonal Forecast of Rose Wine \n with 10% and 90% Confidence Bands \n Actuals in Blue")
lines(q10.srose, col="red")
lines(q90.srose, col="red")
lines(valid.rose.ts, lty=2, col="blue")