setwd("/Users/Chris Iyer/Documents/Assignment5Predictive/")
library(forecast)   # tslm(), forecast(), accuracy()
library(pander)     # pander() / pandoc.table() for formatted tables
canadian <- read.csv("CanadianWorkHours.csv", header = TRUE, stringsAsFactors = FALSE)
#head(canadian)
#convert to a time series
canadian.ts <- ts(canadian$Hours.per.week, start=c(1966, 1), frequency=1)
length(canadian.ts)
## [1] 35
validLength <- 5
trainLength <- length(canadian.ts) - validLength
windowTrain <- window(canadian.ts, start = c(1966,1), end = c(1966, trainLength))
windowValid <- window(canadian.ts, start = c(1966, trainLength + 1), end = c(1966, trainLength + validLength))
Which one of the following regression-based models would fit the series best?
In order to determine which regression model best fits the series, we first plot the time series, fit each candidate model, and plot the fitted values.
. Linear trend model: The linear trend model does not capture the full shape of this series. It captures the overall downward trend but misses the upward trend that begins in the late 1980s.
linearTrain <- tslm(canadian.ts ~ trend)
summary(linearTrain)
##
## Call:
## tslm(formula = canadian.ts ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20457 -0.28761 0.04779 0.30210 1.23190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.416134 0.175818 212.811 < 2e-16 ***
## trend -0.061373 0.008518 -7.205 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.5996
## F-statistic: 51.91 on 1 and 33 DF, p-value: 2.928e-08
. Linear trend model with seasonality: This model is not appropriate for the data. The series is annual, so there is no seasonal component; it cannot be broken into seasonal cycles within any time frame (a quick check of this follows).
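As a sanity check, a minimal sketch (my addition, not part of the original assignment code): tslm() is expected to refuse a seasonal term for this annual, frequency-1 series, so the call is wrapped in try().
# With annual data (frequency = 1) there is no seasonal cycle, so tslm() should stop with an
# error when a season term is requested; try() captures that error instead of halting the script.
seasonalAttempt <- try(tslm(canadian.ts ~ trend + season), silent = TRUE)
inherits(seasonalAttempt, "try-error")  # TRUE confirms a seasonal model is not applicable here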
. Quadratic trend model: This model fits the Canadian manufacturing work hours data better than the linear model, with a higher adjusted R-squared (about 0.75 versus 0.60). The quadratic trend captures both the downward trend and the upward trend that starts to emerge in the late 1980s.
# Quadratic trend - there are two ways to do this
canadaQuad1 <- tslm(canadian.ts ~ trend + I(trend^2))
summary(canadaQuad1)
##
## Call:
## tslm(formula = canadian.ts ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94503 -0.20964 -0.01652 0.31862 0.60160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.1644156 0.2176599 175.340 < 2e-16 ***
## trend -0.1827154 0.0278786 -6.554 2.21e-07 ***
## I(trend^2) 0.0033706 0.0007512 4.487 8.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared: 0.7614, Adjusted R-squared: 0.7465
## F-statistic: 51.07 on 2 and 32 DF, p-value: 1.1e-10
yrange <- range(canadian.ts)
plot(c(1966, 2000), yrange, type="n", xlab="Year", ylab="Hours per Week", bty="l", xaxt="n", yaxt="n")
# Add the x-axis
axis(1, at=seq(1966,2000,1), labels=format(seq(1966,2000,1)))
# Add the y-axis
axis(2, at=seq(30,40, 1), labels=format(seq(30,40, 1)), las=2)
# Add the time series canadian
lines(canadian.ts, bty="l")
lines(linearTrain$fitted.values, col = "green", lwd = 2)
lines(canadaQuad1$fitted.values, col = "blue", lwd = 2)
# Add a legend
legend(1986,37, c("Actual", "Linear, R-sq = 0.60", "Quad, R-sq = 0.75"), lty=c(1,1, 1), col=c("black", "green", "blue"), bty="n", lwd = c(1,2,2))
. Quadratic trend model with seasonality:
Again, this model is not appropriate because the annual data has no seasonal component.
4. Forecasting Department Store Sales: The time series plot shown in Figure 6.12 describes actual quarterly sales for a department store over a 6-year period. The data is available in DepartmentStoreSales.xls.
setwd("/Users/Chris Iyer/Documents/Assignment5Predictive/")
#read in Dept Store Sales
DeptSales <- read.csv("DeptStoreSales.csv", header = TRUE, stringsAsFactors = FALSE)
DeptSales$Yr_Qtr <- c("Year 1 Q-1", "Year 1 Q-2", "Year 1 Q-3", "Year 1 Q-4", "Year 2 Q-1", "Year 2 Q-2", "Year 2 Q-3", "Year 2 Q-4", "Year 3 Q-1", "Year 3 Q-2", "Year 3 Q-3", "Year 3 Q-4", "Year 4 Q-1", "Year 4 Q-2", "Year 4 Q-3", "Year 4 Q-4", "Year 5 Q-1", "Year 5 Q-2", "Year 5 Q-3", "Year 5 Q-4", "Year 6 Q-1", "Year 6 Q-2", "Year 6 Q-3", "Year 6 Q-4")
library(dplyr)      # for select() used below
DeptSales <- DeptSales %>% select(Yr_Qtr, Sales)
DeptSales.ts <- ts(DeptSales$Sales, start = c(1,1), frequency = 4)
library(Hmisc)
yrange <- range(DeptSales.ts)
# Set up the plot
plot(c(1, 7), yrange, type="n", xlab="Year", ylab="Dept Store Sales (thousands)", bty="l", xaxt="n", yaxt="n", main = "Quarterly Dept Store Sales ")
# Add the time series canadian
lines(DeptSales.ts, type = "o")
# Add the x-axis
axis(1, at=seq(1,7,1), labels=format(seq(1,7,1)))
minor.tick(nx = 4)
# Add the y-axis
axis(2, at=seq(40000, 105000, 10000), labels=format(seq(40, 105, 10)), las=2)
(a) The forecaster decided that there is an exponential trend in the series. In order to fit a regression-based model that accounts for this trend, which of the following operations must be performed (either manually or by a function in R)?
. Take a logarithm of the Quarter index: No
. Take a logarithm of sales: Yes. In R this can be done by setting lambda = 0 in the tslm function (see the short sketch after this list).
. Take an exponent of sales: No
. Take an exponent of Quarter index: No
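As an illustration of that point, a minimal sketch (my addition; the two calls below should produce essentially the same fit, one on the log scale manually and one via the Box-Cox lambda = 0 shortcut):
# Fitting log(Sales) directly vs. letting tslm() apply a log (Box-Cox, lambda = 0) transform.
fitLogManual <- tslm(log(DeptSales.ts) ~ trend)          # coefficients are on the log scale
fitLogLambda <- tslm(DeptSales.ts ~ trend, lambda = 0)   # same model; fitted values are back-transformed
coef(fitLogManual)   # the trend coefficient is the average per-quarter growth rate of log sales
coef(fitLogLambda)   # should match the manual log fit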
(b) Fit a regression model with an exponential trend and seasonality, using only the first 20 quarters as the training period (remember to first partition the series into training and validation periods).
SvalidLength <- 4
StrainLength <- length(DeptSales.ts) - SvalidLength
#windows for analysis
sWindowTrain <- window(DeptSales.ts, end = c(1, StrainLength))
sWindowValid <- window(DeptSales.ts, start = c(1, StrainLength + 1))
salesExpoSeason <- tslm(sWindowTrain ~ trend + season, lambda = 0)
summary(salesExpoSeason)
##
## Call:
## tslm(formula = sWindowTrain ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053524 -0.013199 -0.004527 0.014387 0.062681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.748945 0.018725 574.057 < 2e-16 ***
## trend 0.011088 0.001295 8.561 3.70e-07 ***
## season2 0.024956 0.020764 1.202 0.248
## season3 0.165343 0.020884 7.917 9.79e-07 ***
## season4 0.433746 0.021084 20.572 2.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.63e+11 on 4 and 15 DF, p-value: < 2.2e-16
#ggseasonplot(DeptSales.ts)
(c) A partial output is shown in Table 6.7. From the output, after adjusting for trend, are Q2 average sales higher, lower, or approximately equal to the average Q1 sales?
As shown in the regression summary with exponential trend and seasonality, Quarter 2 average sales are approximately equal to those of Quarter 1; the season2 coefficient is small and not statistically significantly different from the Quarter 1 baseline (p ≈ 0.25).
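This can also be read directly off the fitted model object (a small sketch using salesExpoSeason from above):
# season2 measures Q2 relative to the Q1 baseline on the log scale (lambda = 0 transform).
# Its estimate (~0.025) is small and its p-value (~0.25) is not significant, so Q2 and Q1
# average sales are approximately equal after adjusting for trend.
coef(summary(salesExpoSeason))["season2", ]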
(d) Use this model to forecast sales in quarters 21 and 22.
#forecast 21 and 22
forecastSalesExpoSeason <- forecast(salesExpoSeason, h = 2)
forecastSalesExpoSeason
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 6 Q1 58793.71 55790.19 61958.92 54090.84 63905.46
## 6 Q2 60951.51 57837.76 64232.89 56076.04 66250.87
(e) The plots shown in Figure 6.13 describe the fit (top) and forecast errors (bottom) from this regression model.
i. Recreate these plots.
yrange <- range(sWindowTrain)
plot(c(1,6), yrange, type = "n", xlab = "Year", ylab = "Dept Store Sales (thousands)", bty = "l", xaxt = "n", yaxt = "n", lwd = 2, main = "Regression with Exponential Trend and Seasonality \nDept Store Sales")
axis(1, at = seq(1, 6, 1), labels = format(seq(1, 6, 1)))
minor.tick(nx = 4)
axis(2, at = seq(40000, 105000, 10000), labels = format(seq(40, 105, 10)), las = 2)
lines(sWindowTrain, bty = "l", lwd = 2, type = "o")
lines(salesExpoSeason$fitted.values, col="blue", lwd=2)
legend(1, 105000, c("Dept Store Sales", "Exponential Trend + Seasonality Fit"), lty = c(1, 1), col = c("black", "blue"), lwd = c(2, 2), bty = "n")
plot(sWindowTrain - salesExpoSeason$fitted.values, type="o", bty="l", xlab = "Year", ylab = "Residuals", main = "Dept Store Sales Residuals in Training Period \nExponential Trend with Seasonality Fit")
abline(h = 0)
ii. Based on these plots, what can you say about your forecasts for quarters Q21 and Q22? Are they likely to overforecast, under-forecast, or be reasonably close to the real sales values?
Years 2-4 were over-forecast. However, as seen above, quarters 2-4 of Year 5 were under-forecast, and I expect Q1 and Q2 of Year 6 to be under-forecast as well. A plot of the residuals from Year 6 (see below) confirms an increasing degree of under-forecasting. Below that, the raw validation data is plotted against the forecast for the same period to look at the accuracy another way: Year 6 is under-forecast by the exponential trend with seasonality model.
# Forecast the full 4-quarter validation period (Year 6) for the residual plots below
forecastSalesExpoSeason4 <- forecast(salesExpoSeason, h = SvalidLength)
plot(sWindowValid - forecastSalesExpoSeason4$mean, type="o", bty="l", xlab = "Year 6", ylab = "Residuals", main = "Dept Store Sales Residuals from Validation Period \nExponential Trend with Seasonality Fit", xaxt = "n", ylim = c(-1000, 9000))
axis(1, at = seq(6, 6+3/4, 1/4), labels = c("Q-1", "Q-2", "Q-3", "Q-4"))
abline(h = 0, lwd = 3)
yrange <- range(sWindowValid)
# Actual validation sales and point forecasts for Year 6, in the same (raw) units as the plot
x.qtrs <- seq(6, 6.75, by = 0.25)
y.forecast <- c(58793.71, 60951.51, 70920.09, 93788.66)
y.actual <- c(60800, 64900, 76997, 103337)
plot(c(6,7), yrange, type = "n", xlab = "Year", ylab = "Dept Store Sales (thousands)", bty = "l", xaxt = "n", yaxt = "n", lwd = 2, main = "Regression with Exponential Trend and Seasonality \nValidation Period Dept Store Sales", ylim = c(50000, 105000))
axis(1, at = seq(6, 6+3/4, 1/4), labels = c("Q-1", "Q-2", "Q-3", "Q-4"))
axis(2, at=seq(40000, 105000, 10000), labels=format(seq(40, 105, 10)), las=2)
# Shade the gap between the actual sales and the forecasted values
polygon(c(x.qtrs, rev(x.qtrs)), c(y.actual, rev(y.forecast)), col = "grey30", border = NA)
lines(sWindowValid, bty = "l", lwd = 2, type = "o")
lines(forecastSalesExpoSeason4$mean, col="blue", lwd=2, type = "o")
legend(6.5, 75000, c("Dept Store Sales", "Forecasted Values"), lty = c(1,1), col = c("black", "blue"), lwd = c(2,2), bty = "n")
The accuracy of the two windows is shown below. The validation period has a higher MAPE and RMSE than the training period.
accuracyTrain <- accuracy(sWindowTrain, salesExpoSeason$fitted.values)
accuracyValid <- accuracy(sWindowValid, forecastSalesExpoSeason4$mean)
pander(pandoc.table(accuracyTrain, caption = "Error: Dept Store Sales Training Set (Years 1-5)"))
|  | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U |
|---|---|---|---|---|---|---|---|
| Test set | -30.63 | 1759 | 1389 | -0.0404 | 2.224 | 0.5541 | 0.1149 |
pander(pandoc.table(accuracyValid, caption = "Error: Dept Store Sales Validation Set (Year 6)"))
|  | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U |
|---|---|---|---|---|---|---|---|
| Test set | -5395 | 6077 | 5395 | -7.16 | 7.16 | 0.2156 | 0.4966 |
(f) Looking at the residual plot, which of the following statements appear true?
. Seasonality is not captured well. False
. The regression model fits the data well. True
. The trend in the data is not captured well by the model. False
(g) Which of the following solutions is adequate and a parsimonious solution for improving model fit?
. Fit a quadratic trend model to the residuals (with Quarter and Quarter^2): This is not an adequate solution.
. Fit a quadratic trend model to Sales (with Quarter and Quarter^2): This is an adequate and parsimonious solution (see the sketch below).
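A hedged sketch of that second option, reusing the sWindowTrain training window from above (only the form of the call is shown; the resulting coefficients are not reported here):
# Quadratic trend with seasonality fit to Sales: adding a single I(trend^2) predictor
# allows curvature in the trend while keeping the model parsimonious.
salesQuadSeason <- tslm(sWindowTrain ~ trend + I(trend^2) + season)
summary(salesQuadSeason)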
5. Souvenir Sales: The file SouvenirSales.xls contains monthly sales for a souvenir shop at a beach resort town in Queensland, Australia, between 1995 and 2001 (Source: R. J. Hyndman Time Series Data Library, http://data.is/TSDLdemo; accessed on Mar 28, 2016). Back in 2001, the store wanted to use the data to forecast sales for the next 12 months (year 2002). They hired an analyst to generate forecasts. The analyst first partitioned the data into training and validation periods, with the validation set containing the last 12 months of data (year 2001). She then fit a regression model to sales, using the training period.
setwd("/Users/Chris Iyer/Documents/Assignment5Predictive/")
souvenirSales<- read.csv("SouvenirSales.csv")
souvenir.ts <- ts(souvenirSales$Sales, start = c(1995,1), frequency = 12)
validLength <- 12
trainLength <- length(souvenir.ts) - validLength
souvenirTrain <- window(souvenir.ts, end=c(1995, trainLength))
souvenirValid <- window(souvenir.ts, start=c(1995,trainLength+1))
(a) Based on the two time plots in Figure 6.14, which predictors should be included in the regression model? What is the total number of predictors in the model?
Based on the two plots below, which reproduce those shown in the textbook, two components should be included as predictors: trend and seasonality. Trend is a single variable, and seasonality enters as 11 dummy variables for months 2-12, so the model has 12 predictors in total (a quick check of the dummy count follows the plots).
plot(souvenir.ts, type = "n", xaxt = "n", yaxt = "n", xlab = "Year", ylab = "Sales (thousands)", main = "Souvenir Sales")
lines(souvenir.ts, col = "grey48", lwd = 2, lty = 1, pch = 18, type = "o")
# Add the x-axis
axis(1, at=seq(1995,2002,1), labels=format(seq(1995,2002,1)))
axis(2, at = seq(0, 110000, 20000), labels = format(seq(0, 110, 20)), las = 2)
plot(log(souvenir.ts), type = "n", xaxt = "n", yaxt = "n", xlab = "Year", ylab = "Souvenir Sales (AUD)", main = "Log Souvenir Sales")
lines(log(souvenir.ts), col = "grey48", lwd = 2, lty = 1, pch = 18, type = "o")
# Add the x-axis
axis(1, at=seq(1995,2002,1), labels=format(seq(1995,2002,1)))
# Add the y-axis
axis(2, at = seq(7, 11, 1), labels = format(round(exp(seq(7, 11, 1)), digits = 0)), las = 2)
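A quick check of where the 11 seasonal dummies come from (a sketch; seasonaldummy() from the forecast package builds a dummy matrix equivalent to the season factor that tslm uses):
# 72 training months x 11 columns: one dummy per month 2-12, with January as the baseline.
# Adding the single trend variable gives 12 predictors in total.
dim(seasonaldummy(souvenirTrain))
head(seasonaldummy(souvenirTrain), 3)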
(b) Run a regression model with Sales (in Australian dollars) as the output variable and with a linear trend and monthly predictors. Remember to fit only the training period. Call this model A.
Model A
#Run Regression Model
modelA <- tslm(souvenirTrain ~ trend + season)
SumA <- summary(modelA)
SumA
##
## Call:
## tslm(formula = souvenirTrain ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12592 -2359 -411 1940 33651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3065.55 2640.26 -1.161 0.25029
## trend 245.36 34.08 7.199 1.24e-09 ***
## season2 1119.38 3422.06 0.327 0.74474
## season3 4408.84 3422.56 1.288 0.20272
## season4 1462.57 3423.41 0.427 0.67077
## season5 1446.19 3424.60 0.422 0.67434
## season6 1867.98 3426.13 0.545 0.58766
## season7 2988.56 3427.99 0.872 0.38684
## season8 3227.58 3430.19 0.941 0.35058
## season9 3955.56 3432.73 1.152 0.25384
## season10 4821.66 3435.61 1.403 0.16573
## season11 11524.64 3438.82 3.351 0.00141 **
## season12 32469.55 3442.36 9.432 2.19e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5927 on 59 degrees of freedom
## Multiple R-squared: 0.7903, Adjusted R-squared: 0.7476
## F-statistic: 18.53 on 12 and 59 DF, p-value: 9.435e-16
plot(souvenir.ts, type = "n", xaxt = "n",
yaxt = "n",
xlab = "Year", ylab = " Souvenir Sales (thousands)", main = "Model A \nRegression Model with Trend and Seasonality")
lines(souvenirTrain, col = "black", lwd = 2)
lines(modelA$fitted.values, col = "blue", lwd = 2)
# Add the x-axis
axis(1, at=seq(1995,2002,1), labels=format(seq(1995,2002,1)))
axis(2, at = seq(0, 110000, 20000), labels = format(seq(0, 110, 20)), las = 2)
legend(1996, 100000, c("Actual Souvenir Sales", "Regression Fit Trend Seasonality"), col = c("black", "blue"), lwd = c(2,2), bty = "n", lty = c(1,1))
Training Window Model A Accuracy
#Forecast
FCA <- forecast(modelA, h = 2)
#Accuracy
AccA <- accuracy(modelA$fitted.values, souvenirTrain)
pander(AccA)
|  | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U |
|---|---|---|---|---|---|---|---|
| Test set | -5.684e-14 | 5365 | 3205 | 6.968 | 36.75 | 0.4048 | 1.02 |
i. Examine the coefficients: Which month tends to have the highest average sales during the year? Why is this reasonable?
December. This seems reasonable because December is summer in Australia and the height of the tourist and holiday season. December is also one of only two months (along with November) whose sales are statistically significantly higher than the January baseline.
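Ranking the seasonal coefficients of model A confirms this (a small sketch):
# Seasonal coefficients are measured relative to January; December (season12) is by far
# the largest, followed by November, matching the summer/holiday peak in the plots.
seasonCoefs <- coef(modelA)[grep("^season", names(coef(modelA)))]
sort(seasonCoefs, decreasing = TRUE)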
ii. What does the trend coefficient of model A mean?
The trend coefficient of model A, 245.36, means that souvenir sales increase by about 245 Australian dollars per month on average, holding the month (season) constant.
(c) Run a regression model with log(Sales) as the output variable and with a linear trend and monthly predictors. Remember to fit only the training period. Call this model B.
#exponential seasonal
modelB <- tslm(log(souvenirTrain) ~ trend + season)
summary(modelB)
##
## Call:
## tslm(formula = log(souvenirTrain) ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4529 -0.1163 0.0001 0.1005 0.3438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.646363 0.084120 90.898 < 2e-16 ***
## trend 0.021120 0.001086 19.449 < 2e-16 ***
## season2 0.282015 0.109028 2.587 0.012178 *
## season3 0.694998 0.109044 6.374 3.08e-08 ***
## season4 0.373873 0.109071 3.428 0.001115 **
## season5 0.421710 0.109109 3.865 0.000279 ***
## season6 0.447046 0.109158 4.095 0.000130 ***
## season7 0.583380 0.109217 5.341 1.55e-06 ***
## season8 0.546897 0.109287 5.004 5.37e-06 ***
## season9 0.635565 0.109368 5.811 2.65e-07 ***
## season10 0.729490 0.109460 6.664 9.98e-09 ***
## season11 1.200954 0.109562 10.961 7.38e-16 ***
## season12 1.952202 0.109675 17.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared: 0.9424, Adjusted R-squared: 0.9306
## F-statistic: 80.4 on 12 and 59 DF, p-value: < 2.2e-16
plot(souvenirTrain, main = "Model B \nLog(Souvenir Sales) with Trend and Seasonality", lwd = 3, ylab = "Souvenir Sales")
# axis(2, at = seq(0, 100000, exp(10)), labels = format(seq(0, 100000, exp(10))), las = 2)
lines(exp(modelB$fitted.values), lwd = 2, col = "red")
#lines(modelA$fitted.values, col = "blue", lwd = 2)
legend(1996, 80000, c("Actual Souvenir Sales", "Exponential Fit (T&S)"),
col = c("black", "red"), lwd = c(3, 2), bty = "n", lty = c(1, 1))
Training Window Model B Accuracy
pander(accuracy(exp(modelB$fitted.values), souvenirTrain))
|  | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U |
|---|---|---|---|---|---|---|---|
| Test set | 197.5 | 2865 | 1671 | -1.473 | 13.94 | 0.4381 | 0.4086 |
i. Fitting a model to log(Sales) with a linear trend is equivalent to fitting a model to Sales (in dollars) with what type of trend?
Fitting a linear trend to log(Sales) is equivalent to fitting an exponential trend to Sales (in dollars).
ii. What does the estimated trend coefficient of model B mean?
The estimated trend coefficient of model B, 0.02112, is on the log scale: it means souvenir sales grow by roughly 2.1% per month on average, holding the month (season) constant.
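The exact multiplicative reading of that coefficient can be computed directly (a minimal sketch):
# On the log scale, a trend coefficient b multiplies sales by exp(b) each month.
b <- coef(modelB)["trend"]
exp(b)               # ~1.0213, i.e. roughly 2.1% growth per month, holding season constant
(exp(b) - 1) * 100   # the same monthly growth rate expressed in percent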
iii. Use this model to forecast the sales in February 2002.
FCB <- forecast(modelB, h = 2)
pander(exp(FCB$mean))
|  | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2001 | 9780 | 13243 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Model B, log(Souvenir Sales), Forecast Accuracy
pander(accuracy(exp(FCB$mean), souvenirValid[1:2]))
|  | ME | RMSE | MAE | MPE | MAPE |
|---|---|---|---|---|---|
| Test set | -756.5 | 1435 | 1220 | -6.509 | 11.03 |
(d) Compare the two regression models (A and B) in terms of forecast performance. Which model is preferable for forecasting? Mention at least two reasons based on the information in the outputs.
Forecast Performance Model A
pander(accuracy(FCA$mean, souvenirValid[1:2]))
|  | ME | RMSE | MAE | MPE | MAPE |
|---|---|---|---|---|---|
| Test set | -4773 | 4776 | 4773 | -44.41 | 44.41 |
Model B's forecast, which fits the Australian souvenir sales data with an exponential trend and seasonality, performs better than Model A's, which fits the data with a linear trend and seasonality: both the forecast RMSE and MAPE are lower for Model B. In addition, the adjusted R-squared, which measures how well the model fits the training series and is another criterion for choosing between candidate models, is higher for Model B (0.93 versus 0.75).
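Putting the two validation-period accuracy measures side by side makes the comparison easier to read (a sketch reusing FCA and FCB from above; row names are set manually because accuracy() labels every row "Test set"):
# Validation accuracy on the first two months of 2001: Model A vs Model B (back-transformed from logs)
validCompare <- rbind(accuracy(FCA$mean, souvenirValid[1:2]),
accuracy(exp(FCB$mean), souvenirValid[1:2]))
rownames(validCompare) <- c("Model A", "Model B")
pander(validCompare)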
(e) How would you model this data differently if the goal was understanding the different components of sales in the souvenir shop between 1995 and 2001? Mention two differences.
If I wanted to explore the trend component of the souvenir sales series, I would fit a linear and a quadratic trend model. Their adjusted R-squared values show how much of the variation in sales is explained by the linear and the quadratic trend, respectively, and the fitted lines on the plot isolate the trend component.
yrange <- range(souvenirTrain)
souvenirTrend <- tslm(souvenirTrain ~ trend)
souvenirQuad <- tslm(souvenirTrain ~ trend + I(trend^2))
summary(souvenirTrend)
##
## Call:
## tslm(formula = souvenirTrain ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11178 -5873 -1866 1286 58728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1044.63 2423.89 0.431 0.668
## trend 290.96 57.71 5.042 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10180 on 70 degrees of freedom
## Multiple R-squared: 0.2664, Adjusted R-squared: 0.2559
## F-statistic: 25.42 on 1 and 70 DF, p-value: 3.474e-06
summary(souvenirQuad)
##
## Call:
## tslm(formula = souvenirTrain ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13325 -4550 -1733 598 52846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7438.434 3585.130 2.075 0.0417 *
## trend -227.458 226.644 -1.004 0.3191
## I(trend^2) 7.102 3.009 2.360 0.0211 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9860 on 69 degrees of freedom
## Multiple R-squared: 0.3212, Adjusted R-squared: 0.3015
## F-statistic: 16.33 on 2 and 69 DF, p-value: 1.566e-06
plot(c(1995, 2001), yrange, type="n", xlab="Year", ylab="Souvenir Sales", bty="l", xaxt="n", yaxt="n", main = "Trend Component in the Souvenir Sales data")
# Add the x-axis
axis(1, at=seq(1995,2001,1), labels=format(seq(1995,2001,1)))
axis(2, at=seq(1500,82000,15000), labels=format(seq(1500,82000,15000)), las=2)
lines(souvenirTrain, lwd = 2)
lines(souvenirTrend$fitted.values, col = "red", lwd = 2)
lines(souvenirQuad$fitted.values, col = "blue", lwd = 2)
legend(1995, 80000, c("Souvenir Sales", "Linear Trend MAPE = 26%", "Quad Trend MAPE = 30%"), col = c("black", "red", "blue"), lwd = c(2, 2, 2), bty = "n")
If I wanted to understand the seasonal component, I would run a linear regression of sales on the seasonal dummies alone (no trend term). The R-squared of this model measures the variation in souvenir sales explained by seasonality, and the red line on the plot isolates the seasonal component of the series.
souvenirSeason <- tslm(souvenirTrain ~ season)
summary(souvenirSeason)
##
## Call:
## tslm(formula = souvenirTrain ~ season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19953 -3198 -794 2130 41012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4541 3288 1.381 0.17246
## season2 1365 4651 0.293 0.77019
## season3 4900 4651 1.054 0.29632
## season4 2199 4651 0.473 0.63810
## season5 2428 4651 0.522 0.60359
## season6 3095 4651 0.665 0.50831
## season7 4461 4651 0.959 0.34132
## season8 4945 4651 1.063 0.29190
## season9 5918 4651 1.273 0.20806
## season10 7030 4651 1.512 0.13588
## season11 13978 4651 3.006 0.00386 **
## season12 35169 4651 7.562 2.74e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8055 on 60 degrees of freedom
## Multiple R-squared: 0.6061, Adjusted R-squared: 0.5338
## F-statistic: 8.391 on 11 and 60 DF, p-value: 1.058e-08
yrange <- range(souvenirTrain)
yrange
## [1] 1664.81 80721.71
plot(c(1995, 2001), yrange, type="n", xlab="Year", ylab="Souvenir Sales", bty="l", xaxt="n", yaxt="n", main = "Seasonal Component in the Souvenir Sales Data")
# Add the x-axis
axis(1, at=seq(1995,2001,1), labels=format(seq(1995,2001,1)))
axis(2, at=seq(1500,82000,15000), labels=format(seq(1500,82000,15000)), las=2)
lines(souvenirTrain, lwd = 2)
lines(souvenirSeason$fitted.values, col = "red", lwd = 2)
legend(1995, 80000, c("Souvenir Sales", "Seasonality MAPE = 53 %"), col = c("black", "red"), lwd = c(2, 2), bty = "n")