library(forecast)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.3
library(readxl)
Canada_Hours <- read_excel("~/rProjects/Assignment_5/Canada_Hours.xlsx")
# Create a time series object out of it
hours.ts <- ts(Canada_Hours$HoursPerWeek, start=c(1966, 1), frequency=1)
# Linear trend
hoursLinear <- tslm(hours.ts ~ trend)
summary(hoursLinear)
##
## Call:
## tslm(formula = hours.ts ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20457 -0.28761 0.04779 0.30210 1.23190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.416134 0.175818 212.811 < 2e-16 ***
## trend -0.061373 0.008518 -7.205 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.5996
## F-statistic: 51.91 on 1 and 33 DF, p-value: 2.928e-08
# Quadratic trend, as in the textbook
hoursQuad1 <- tslm(hours.ts ~ trend + I(trend^2))
summary(hoursQuad1)
##
## Call:
## tslm(formula = hours.ts ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94503 -0.20964 -0.01652 0.31862 0.60160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.1644156 0.2176599 175.340 < 2e-16 ***
## trend -0.1827154 0.0278786 -6.554 2.21e-07 ***
## I(trend^2) 0.0033706 0.0007512 4.487 8.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared: 0.7614, Adjusted R-squared: 0.7465
## F-statistic: 51.07 on 2 and 32 DF, p-value: 1.1e-10
yrange = range(hours.ts)
# Set up the plot
plot(c(1966, 2000), yrange, type="n", xlab="Year", ylab="Hours Worked Per Week", bty="l", xaxt="n", yaxt="n")
# Add the time series
lines(hours.ts, bty="l")
# Add the x-axis
axis(1, at=seq(1965,2000,5), labels=format(seq(1965,2000,5)))
# Add the y-axis
axis(2, at=seq(34.5, 38.0, .5), labels=format(seq(34.5, 38.0, .5)), las=2)
# Add the linear trend model
lines(hoursLinear$fitted, col="red")
lines(hoursQuad1$fitted, col="blue")
# Add a legend
legend(1990,37.5, c("Hours Worked", "Linear", "Quadratic"), lty=c(1,1,1), col=c("black", "red", "blue"), bty="n")

Question 2
The quadratic trend model best fits the hours-worked series. This is evident both in the plot above and in the model summaries: the plot shows the quadratic trend tracking the data more closely, and its summary reports a noticeably higher adjusted R-squared (0.7465 versus 0.5996 for the linear model), which indicates a better fit.
The two options that account for seasonality were ruled out immediately because this series is annual. To model seasonality, we would need data at a sub-yearly granularity (days, weeks, months, or quarters).
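As a numeric cross-check, here is a minimal sketch using CV() from the forecast package, which reports information criteria and adjusted R-squared for tslm/lm fits:
# Compare the two trend models; lower AICc/BIC and higher adjusted
# R-squared favor the quadratic model
CV(hoursLinear)
CV(hoursQuad1)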
Question 4
A)
To fit an exponential trend, the model must be fit to log(y); in our example, we take the logarithm of Sales.
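For illustration, a minimal sketch (on a toy series, not the assignment data; the object names are illustrative) showing that fitting log(y) by hand matches tslm()'s built-in Box-Cox transform with lambda = 0, which is what the code below uses:
# Toy series with an exponential trend (illustration only)
set.seed(1)
y <- ts(exp(0.05 * (1:40) + rnorm(40, sd=0.1)))
fitLog <- tslm(log(y) ~ trend)           # fit to log(y) directly
fitLambda <- tslm(y ~ trend, lambda=0)   # equivalent Box-Cox fit
coef(fitLog)
coef(fitLambda)                          # same coefficients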
B)
DeptStoreSales <- read_excel("~/rProjects/Assignment_5/DeptStoreSales.xlsx")
sales.ts <- ts(DeptStoreSales$Sales, start=c(1), frequency=4)
validLength <- 4
trainLength <- length(sales.ts) - validLength
salesTrain <- window(sales.ts, end=c(1, trainLength))
salesValid <- window(sales.ts, start=c(1,trainLength+1))
salesExpo <- tslm(salesTrain ~ trend + season, lambda=0)
summary(salesExpo)
##
## Call:
## tslm(formula = salesTrain ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053524 -0.013199 -0.004527 0.014387 0.062681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.748945 0.018725 574.057 < 2e-16 ***
## trend 0.011088 0.001295 8.561 3.70e-07 ***
## season2 0.024956 0.020764 1.202 0.248
## season3 0.165343 0.020884 7.917 9.79e-07 ***
## season4 0.433746 0.021084 20.572 2.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.63e+11 on 4 and 15 DF, p-value: < 2.2e-16
validPeriodForecasts <- forecast(salesExpo, h=validLength)
validPeriodForecasts
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 6 Q1 58793.71 55790.19 61958.92 54090.84 63905.46
## 6 Q2 60951.51 57837.76 64232.89 56076.04 66250.87
## 6 Q3 70920.09 67297.09 74738.13 65247.24 77086.15
## 6 Q4 93788.66 88997.41 98837.86 86286.57 101943.01
# Helps set up the plot
yrange2 = range(sales.ts)
# Set up the plot
plot(c(1, 7), yrange2, type="n", xlab="Year", ylab="Sales (thousands)", bty="l", xaxt="n", yaxt="n")
# Add the time series
lines(sales.ts, bty="l")
# Add the x-axis
axis(1, at=seq(1,7,1), labels=format(seq(1,7,1)))
# Add the y-axis
axis(2, at=seq(40000,105000,10000), labels=format(seq(40,105,10)), las=2)
# Add fitted line from training period
lines(salesExpo$fitted, col="red")
# Add forecasts for the validation period
lines(validPeriodForecasts$mean, col="blue", lty=2)
# Add the legend
legend(1,100000, c("Sales", "Exponential Trend+Season", "Forecasts"), lty=c(1,1,2), col=c("black", "red", "blue"), bty="n")

D)
Below are the forecasted values for quarters 21 and 22, the first two quarters of the validation period:
# Print the forecasted sales for Q21 & Q22
validPeriodForecasts$mean[1:2]
## 1 2
## 58793.71 60951.51
E)
Looking at the residuals, I would say the model is slightly more likely to over-forecast than under-forecast. The training period consists of 20 values, and the residuals show 11 over-forecasts (negative residuals) and 9 under-forecasts (positive residuals), so the model over-forecasts slightly more often. However, plotting the validation-period residuals shows that the model actually under-forecasts there. A quick tally of the residual signs appears below the plots.
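The tally backing up those sign counts (fitted values from tslm() with lambda = 0 are already back-transformed to the original scale, so these residuals are in dollars):
# Count negative (over-forecast) vs positive (under-forecast) residuals
table(sign(salesTrain - salesExpo$fitted))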
# Helps set up the plot
yrange3 = range(sales.ts)
# Set up the plot
plot(c(1, 5.75), yrange3, type="n", xlab="Year", ylab="Sales (thousands)", bty="l", xaxt="n", yaxt="n")
# Add the time series
lines(salesTrain, bty="l")
# Add the x-axis
axis(1, at=seq(1,5.75,1), labels=format(seq(1,5.75,1)))
# Add the y-axis
axis(2, at=seq(40000,105000,10000), labels=format(seq(40,105,10)), las=2)
# Add fitted line from training period
lines(salesExpo$fitted, col="red")
# Add the legend
legend(1,100000, c("Sales", "Exponential Trend+Season"), lty=c(1,1), col=c("black", "red"), bty="n")

# Plot residuals using actual values instead of log-transformed units
plot(salesTrain - salesExpo$fitted, type="o", bty="l")
# Add a line at y = 0 to help identify positive and negative values
abline(h = 0, col = "red")

# Plot the residuals for the validation period
plot(salesValid - validPeriodForecasts$mean, type="o", bty="l")
abline(h = 0, col = "red")

F)
Based on the training-period residual plot alone, I would say the regression model fits the data well: there are almost equal numbers of points above and below y = 0, the residuals stay within roughly [-3000, 3000], the points are not skewed in any direction, and there is no discernible pattern.
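A histogram of the same residuals gives a similar impression (a quick sketch):
# Histogram of training-period residuals in original units
hist(salesTrain - salesExpo$fitted, xlab="Residual", main="Training residuals")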
G)
I would fit a quadratic trend model to Sales, using Quarter and Quarter^2 as the trend terms; a sketch follows.
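A minimal sketch of that specification, keeping the seasonal dummies alongside the quadratic trend (my assumption; the answer above leaves the seasonal terms implicit, and salesQuad is an illustrative name):
# Quadratic trend (trend and trend^2) with quarterly dummies retained
salesQuad <- tslm(salesTrain ~ trend + I(trend^2) + season)
summary(salesQuad)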
Question 5
A)
There need to be 13 predictors in all: 11 dummy variables for month, plus t and t^2 for the trend. This is because the data series exhibits both trend and seasonality. A sketch of this specification follows the train/validation split below.
SouvenirSales <- read_excel("~/rProjects/Assignment_5/SouvenirSales.xlsx")
sales2.ts <- ts(SouvenirSales$Sales, start=c(1995,1), frequency=12)
validLength2 <- 12
trainLength2 <- length(sales2.ts) - validLength2
salesTrain2 <- window(sales2.ts, end=c(1995, trainLength2))
salesValid2 <- window(sales2.ts, start=c(1995,trainLength2+1))
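The sketch of that 13-predictor specification on the training set (salesQuadSeason2 is an illustrative name):
# Quadratic trend plus 11 monthly dummies = 13 predictors
salesQuadSeason2 <- tslm(salesTrain2 ~ trend + I(trend^2) + season)
length(coef(salesQuadSeason2)) - 1  # 13 predictors, excluding the intercept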
B)
i) November and December have the highest average sales during the year. This is reasonable because many holidays fall at the end of the year, which means people have time off work to visit the beach, and gifts play a big part in the holidays. A quick numeric check follows this list.
ii) The trend coefficient in model A indicates that sales increase by about $245.36 each month, an upward trend.
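The quick check for (i): average sales by calendar month over the training period, using cycle() to recover the month index:
# Mean sales per calendar month (1 = January ... 12 = December)
tapply(salesTrain2, cycle(salesTrain2), mean)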
# Fit the model to the training set
salesLinearSeason2 <- tslm(salesTrain2 ~ trend + season)
# See the estimated regression equation
summary(salesLinearSeason2)
##
## Call:
## tslm(formula = salesTrain2 ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12592 -2359 -411 1940 33651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3065.55 2640.26 -1.161 0.25029
## trend 245.36 34.08 7.199 1.24e-09 ***
## season2 1119.38 3422.06 0.327 0.74474
## season3 4408.84 3422.56 1.288 0.20272
## season4 1462.57 3423.41 0.427 0.67077
## season5 1446.19 3424.60 0.422 0.67434
## season6 1867.98 3426.13 0.545 0.58766
## season7 2988.56 3427.99 0.872 0.38684
## season8 3227.58 3430.19 0.941 0.35058
## season9 3955.56 3432.73 1.152 0.25384
## season10 4821.66 3435.61 1.403 0.16573
## season11 11524.64 3438.82 3.351 0.00141 **
## season12 32469.55 3442.36 9.432 2.19e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5927 on 59 degrees of freedom
## Multiple R-squared: 0.7903, Adjusted R-squared: 0.7476
## F-statistic: 18.53 on 12 and 59 DF, p-value: 9.435e-16
C)
i) Fitting a model to log(Sales) with a linear trend is equivalent to fitting a model to Sales (in dollars) with an exponential trend.
ii) The trend coefficient of model B (0.0211 on the log scale) indicates that monthly sales grow by approximately 2.1% per month, since exp(0.0211) ≈ 1.0213.
iii) The forecast for February 2002 is computed below; the model predicts sales of $17,062.99 for that month. The series contains 84 periods (through December 2001), so February 2002 corresponds to period 86, which is why the trend coefficient is multiplied by 86.
# Regression Model with log(sales) - Model B
modelB <- tslm(log(salesTrain2) ~ trend + season)
summary(modelB)
##
## Call:
## tslm(formula = log(salesTrain2) ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4529 -0.1163 0.0001 0.1005 0.3438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.646363 0.084120 90.898 < 2e-16 ***
## trend 0.021120 0.001086 19.449 < 2e-16 ***
## season2 0.282015 0.109028 2.587 0.012178 *
## season3 0.694998 0.109044 6.374 3.08e-08 ***
## season4 0.373873 0.109071 3.428 0.001115 **
## season5 0.421710 0.109109 3.865 0.000279 ***
## season6 0.447046 0.109158 4.095 0.000130 ***
## season7 0.583380 0.109217 5.341 1.55e-06 ***
## season8 0.546897 0.109287 5.004 5.37e-06 ***
## season9 0.635565 0.109368 5.811 2.65e-07 ***
## season10 0.729490 0.109460 6.664 9.98e-09 ***
## season11 1.200954 0.109562 10.961 7.38e-16 ***
## season12 1.952202 0.109675 17.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared: 0.9424, Adjusted R-squared: 0.9306
## F-statistic: 80.4 on 12 and 59 DF, p-value: < 2.2e-16
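For (ii), the implied monthly growth rate can be read directly off the trend coefficient:
# Implied monthly growth in percent: 100 * (exp(beta_trend) - 1), about 2.1
100 * (exp(coef(modelB)["trend"]) - 1)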
# Use the named coefficients
febForecast <- modelB$coefficients["(Intercept)"] + modelB$coefficients["trend"]*86 + modelB$coefficients["season2"]
# Convert them back and simply echo them
exp(febForecast)
## (Intercept)
## 17062.99
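Equivalently, the same number comes from forecast() directly; February 2002 is 14 steps past the training period, which ends in December 2000:
# h = 14: Jan 2001 is step 1, so Feb 2002 is step 14
exp(forecast(modelB, h=14)$mean[14])  # matches the $17,062.99 above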
D)
Model B is preferable for forecasting because:
1. Its MAPE on the validation period is considerably lower than model A's: 15.52% versus 26.67%.
2. Its MAE/MAD (mean absolute error/deviation) is nearly half of model A's: $5,191.67 versus $10,055.28.
Both measures indicate that model B is more accurate and should be used to forecast this series.
modelAForecast <- forecast(salesLinearSeason2, h=validLength2)
accuracy(modelAForecast, salesValid2)
## ME RMSE MAE MPE MAPE MASE
## Training set -5.684342e-14 5365.199 3205.089 6.967778 36.75088 0.855877
## Test set 8.251513e+03 17451.547 10055.276 10.533974 26.66568 2.685130
## ACF1 Theil's U
## Training set 0.4048039 NA
## Test set 0.3206228 0.9075924
modelBForecast <- forecast(modelB, h=validLength2)
# We have to transform the forecasts back to the original scale.
accuracy(exp(modelBForecast$mean), salesValid2)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 4824.494 7101.444 5191.669 12.35943 15.5191 0.4245018 0.4610253
E)
1. I would plot the data separately by month to better identify and visualize monthly patterns (a sketch follows this list).
2. If daily sales data were available, I would look for correlations between daily sales and average daily temperature to see whether weather affects sales. This might tell the owners that on colder days people tend to come inside to shop, while on warmer days they prefer to stay out on the beach.
3. I might apply a smoothing technique, averaging the previous, current, and next months, to better see sales patterns across the years (also sketched below).
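Sketches of suggestions (1) and (3), using monthplot() from base R and ma() from the forecast package:
# (1) Seasonal subseries plot: one panel per calendar month
monthplot(sales2.ts, ylab="Sales", xlab="Month")
# (3) Centered 3-month moving average overlaid on the raw series
plot(sales2.ts, ylab="Sales", bty="l")
lines(ma(sales2.ts, order=3), col="blue", lwd=2)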