1

suppressPackageStartupMessages(library(tidyverse))
#library(tidyverse)
#library(forecast)
suppressPackageStartupMessages(library(forecast))
#SouvenirSales <- read_csv("C:/Users/JLowell/Desktop/class/week three/SouvenirSales.csv")
##SouvenirSales <- read_csv("~/Dropbox/blogdog/homework/hw/data/SouvenirSales.csv")

SouvenirSales <- read_csv("C:/Users/jake/Dropbox/blogdog/homework/hw/data/SouvenirSales.csv")
## Parsed with column specification:
## cols(
##   Date = col_character(),
##   Sales = col_double()
## )
glimpse((SouvenirSales))
## Observations: 84
## Variables: 2
## $ Date  <chr> "Jan-95", "Feb-95", "Mar-95", "Apr-95", "May-95", "Jun-9...
## $ Sales <dbl> 1664.81, 2397.53, 2840.71, 3547.29, 3752.96, 3714.74, 43...
options(scipen=9)
souvenir.ts <- ts(SouvenirSales[ , 2] , start= c(1995 , 1) , frequency = 12)
#autoplot(souvenir.ts)
componentsOfSouvenir <- decompose(souvenir.ts)
plot(componentsOfSouvenir)

ggseasonplot(souvenir.ts)

We can see a distinct seasonal pattern, with sales spikeing at the end of the year.

(a) Why was the data partitioned?

The data was partitioned so that the analyst could train a forecast model, and measure model performacne agaisnt “out of sample” observations

# Set the length of the validation period to twelve months (one year)
validLength <- 12

# Set the length of the training period to everything else
trainLength <- length(souvenir.ts) - validLength

# Partition the data into training and validation periods
souvenirTrain <- window(souvenir.ts, start=c(1995,1), end=c(1995, trainLength))
souvenirValid <- window(souvenir.ts, start=c(1995,trainLength+1), end=c(1995,trainLength+validLength))

(b) Why choose a 12 month validation period?

The objective of the analyst is to produce a 12 month forecast, so measuring model performance over a 12 month holdout period is most appropriate. The data has a seasonal cycle of 12 months, so it is nice to forcast one entire seasonal cycle.

(c) What is the naive forcast for the validation period?

# Use the seasonal naive forecast
snaiveForValid <- snaive(souvenirTrain, h=validLength)

# To see the point forecasts from the seasonal naive model
snaiveForValid$mean
##           Jan      Feb      Mar      Apr      May      Jun      Jul
## 2001  7615.03  9849.69 14558.40 11587.33  9332.56 13082.09 16732.78
##           Aug      Sep      Oct      Nov      Dec
## 2001 19888.61 23933.38 25391.35 36024.80 80721.71
autoplot(snaiveForValid)

(d) Compute RMSE and MAPE for naive forecast

accuracy(snaiveForValid , souvenirValid)
##                    ME     RMSE      MAE      MPE     MAPE     MASE
## Training set 3401.361 6467.818 3744.801 22.39270 25.64127 1.000000
## Test set     7828.278 9542.346 7828.278 27.27926 27.27926 2.090439
##                   ACF1 Theil's U
## Training set 0.4140974        NA
## Test set     0.2264895 0.7373759

The RMSE of for the naive forcast in the validation period was 9542.35, while the MAPE was 27.28%. Both of these statistics are higher (inferior), for the test set than for the training set.

(e) plot histogram of forecast errors that result from naive forecast (validation period). Also plot sales vs forecast for validation period. What is behavior of forecast?

hist(snaiveForValid$residuals , breaks = 30 , probability =  TRUE)
lines(density(snaiveForValid$residuals , na.rm = TRUE ))

# Plot the actual values from the validation period (2001)
plot(souvenirValid, bty="l", main = "Actual vs Predicted (naive) for 2001", xaxt="n", xlab="The Year 2001", yaxt="n", ylab="Shipments")

axis(1, at=seq(2001,2001.75,0.25), labels=c("Quarter 1", "Quarter 2", "Quarter 3", "Quarter 4"))
axis(2, las=2)

# Now add the forecasts and make the line red and dashed
lines(snaiveForValid$mean, col=2, lty=2)

# Add a legend
legend(2001,4925, c("Actual","Forecast"), col=1:2, lty=1:2)

The forecast has one extreme outlier, which was a larger than expected jump in sales for December 2001

Plot everything together. The following plot shows that the naive forecast has a nice directional fit, but fails to capture the increasing magnitude of the seasonality in the series.

autoplot(souvenir.ts , series =  "Data") + autolayer(fitted(snaiveForValid) , series = "Fitted") + autolayer(snaiveForValid$mean ,series = "Naive") + ggtitle("Actual vs Fitted with Naive Forecast")
## Warning: Removed 12 rows containing missing values (geom_path).

(f) How to forcast for 2002?

The best way to forecast for 2002 is to use all of the data (through 2001), and then forecast for the next 12 months.

# Use the seasonal naive forecast
snaiveFor2002 <- snaive(souvenir.ts, h = 12)

# To see the point forecasts from the seasonal naive model
snaiveForValid$mean
##           Jan      Feb      Mar      Apr      May      Jun      Jul
## 2001  7615.03  9849.69 14558.40 11587.33  9332.56 13082.09 16732.78
##           Aug      Sep      Oct      Nov      Dec
## 2001 19888.61 23933.38 25391.35 36024.80 80721.71
autoplot(snaiveForValid)  + ggtitle("2002 Forcast for naive method") +
  theme(plot.title = element_text(hjust = 0.5))

2

#ShampooSales <- read_csv("C:/Users/JLowell/Desktop/class/week one/ShampooSales.csv")
##ShampooSales <- read_csv("~/Dropbox/blogdog/homework/hw/data/ShampooSales.csv")
ShampooSales <- read_csv("C:/Users/jake/Dropbox/blogdog/homework/hw/data/ShampooSales.csv")
## Parsed with column specification:
## cols(
##   Month = col_character(),
##   `Shampoo Sales` = col_double()
## )
glimpse(ShampooSales)
## Observations: 36
## Variables: 2
## $ Month           <chr> "Jan-95", "Feb-95", "Mar-95", "Apr-95", "May-9...
## $ `Shampoo Sales` <dbl> 266.0, 145.9, 183.1, 119.3, 180.3, 168.5, 231....

If the goal is forecasting sales for future months, it would not hurt to try all of the suggested steps. A challenge with this dataset is that we only have 3 cycles of 12 months.

partition the data into training and validation periods

#convert data to time series
Shampoo.ts <- ts(ShampooSales[,2] ,start= c(1995,1), frequency = 12 )


# Set the length of the validation period to twelve months (one year)
validLength <- 12

# Set the length of the training period to everything else
trainLength <- length(Shampoo.ts) - validLength

# Partition the data into training and validation periods
shampooTrain <- window(Shampoo.ts, start=c(1995,1), end=c(1995, trainLength))
shampooValid <- window(Shampoo.ts, start=c(1995,trainLength+1), end=c(1995,trainLength+validLength))

examine time plots of the series and of model forecast for only the training period

In this case, we would be better served to examine time plots for our entire series, because 3 cycles of 12 months is 50% longer than 2 cycles of 12 months, and would allow us to gain a better visual handle on what is happening in the series.

#autoplot(Shampoo.ts)
componentsOfShampoo <- decompose(Shampoo.ts)
plot(componentsOfShampoo)

ggseasonplot(Shampoo.ts)

The series displays an upward trend over time. There does not appear to be a seasonal trend.

compute naive forecast

# Use the seasonal naive forecast
snaiveForValid <- snaive(shampooTrain, h=validLength)

# To see the point forecasts from the seasonal naive model
snaiveForValid$mean
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 1997 194.3 149.5 210.1 273.3 191.4 287.0 226.0 303.6 289.9 421.6 264.5
##        Dec
## 1997 342.3
autoplot(Shampoo.ts , series =  "Data") + autolayer(fitted(snaiveForValid) , series = "Fitted") + autolayer(snaiveForValid$mean ,series = "Naive") + ggtitle("Actual vs Fitted with Naive Forecast")
## Warning: Removed 12 rows containing missing values (geom_path).

MAPE and RMSE for training period and validation

accuracy(snaiveForValid , shampooValid)
##                     ME     RMSE      MAE      MPE     MAPE     MASE
## Training set  66.33333 121.9118  91.2500 19.00793 30.12280 1.000000
## Test set     215.75833 240.4731 215.7583 43.62046 43.62046 2.364475
##                    ACF1 Theil's U
## Training set -0.3328601        NA
## Test set     -0.6413139  1.824886

The MAPE and RMSE for both the fitted training and the test set confirm that the naive forcast is not very good. There errors are larger for the test set, which is to be expected.

Use full series to forcast into the future

# Use the seasonal naive forecast
snaiveFor1998 <- snaive(Shampoo.ts, h = 12)

# To see the point forecasts from the seasonal naive model
snaiveForValid$mean
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 1997 194.3 149.5 210.1 273.3 191.4 287.0 226.0 303.6 289.9 421.6 264.5
##        Dec
## 1997 342.3
autoplot(snaiveForValid)  + ggtitle("1998 Forcast for naive method") +
  theme(plot.title = element_text(hjust = 0.5))

The seasonal naive forcast still does not appear to be accounting for the upward trend as much as would be appropriate.

args(accuracy)
## function (f, ...) 
## NULL
accuracy(snaiveFor2002)
##                   ME     RMSE     MAE      MPE     MAPE MASE      ACF1
## Training set 4139.18 7073.656 4425.38 23.20713 25.91427    1 0.4004496