library(forecast)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.3
library(readxl)
Canada_Hours <- read_excel("~/rProjects/Assignment_5/Canada_Hours.xlsx")

# Create a time series object out of it
hours.ts <- ts(Canada_Hours$HoursPerWeek, start=c(1966, 1), frequency=1)
# Linear trend
hoursLinear <- tslm(hours.ts ~ trend)
summary(hoursLinear)
## 
## Call:
## tslm(formula = hours.ts ~ trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20457 -0.28761  0.04779  0.30210  1.23190 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.416134   0.175818 212.811  < 2e-16 ***
## trend       -0.061373   0.008518  -7.205 2.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared:  0.6113, Adjusted R-squared:  0.5996 
## F-statistic: 51.91 on 1 and 33 DF,  p-value: 2.928e-08
# Quadratic trend - like the book
hoursQuad1 <- tslm(hours.ts ~ trend + I(trend^2))
summary(hoursQuad1)
## 
## Call:
## tslm(formula = hours.ts ~ trend + I(trend^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94503 -0.20964 -0.01652  0.31862  0.60160 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.1644156  0.2176599 175.340  < 2e-16 ***
## trend       -0.1827154  0.0278786  -6.554 2.21e-07 ***
## I(trend^2)   0.0033706  0.0007512   4.487 8.76e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared:  0.7614, Adjusted R-squared:  0.7465 
## F-statistic: 51.07 on 2 and 32 DF,  p-value: 1.1e-10
yrange = range(hours.ts)

# Set up the plot
plot(c(1966, 2000), yrange, type="n", xlab="Year",  ylab="Hours Worked Per Week", bty="l", xaxt="n", yaxt="n")

# Add the time series air
lines(hours.ts, bty="l")

# Add the x-axis
axis(1, at=seq(1965,2000,5), labels=format(seq(1965,2000,5)))

# Add the y-axis
axis(2, at=seq(34.5, 38.0, .5), labels=format(seq(34.5, 38.0, .5)), las=2)

# Add the linear trend model
lines(hoursLinear$fitted, col="red")
lines(hoursQuad1$fitted, col="blue")

# Add a legend
legend(1990,37.5, c("Hours Worked", "Linear", "Quadratic"), lty=c(1,1,1,1,1), col=c("black", "red", "blue"), bty="n")


Question 2

The Quadratic Trend Model would best fit the hours-worked data series. The answer can easily be seen in both the plot above as well as in the model summaries. The plot shows how the quadratic trend model more accurately fits the data and the summary for the model displays a much higher R-squared value, which implies a better fit.
The two options that took seasonality into account were immediately ruled out because this data series provides values on a yearly basis. In order to address seasonality, we would need to break the yearly data down into time periods that fall within a year (days, weeks, months, quarters).

Question 4

A)

In order to fit an exponential trend the log(y) must be calculated; in our example take the logarithm of sales.

B)

DeptStoreSales <- read_excel("~/rProjects/Assignment_5/DeptStoreSales.xlsx")

sales.ts <- ts(DeptStoreSales$Sales, start=c(1), frequency=4)

validLength <- 4
trainLength <- length(sales.ts) - validLength

salesTrain <- window(sales.ts, end=c(1, trainLength))
salesValid <- window(sales.ts, start=c(1,trainLength+1))
salesExpo <- tslm(salesTrain ~ trend + season, lambda=0)
summary(salesExpo)
## 
## Call:
## tslm(formula = salesTrain ~ trend + season, lambda = 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053524 -0.013199 -0.004527  0.014387  0.062681 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.748945   0.018725 574.057  < 2e-16 ***
## trend        0.011088   0.001295   8.561 3.70e-07 ***
## season2      0.024956   0.020764   1.202    0.248    
## season3      0.165343   0.020884   7.917 9.79e-07 ***
## season4      0.433746   0.021084  20.572 2.10e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 7.63e+11 on 4 and 15 DF,  p-value: < 2.2e-16
validPeriodForecasts <- forecast(salesExpo, h=validLength)
validPeriodForecasts
##      Point Forecast    Lo 80    Hi 80    Lo 95     Hi 95
## 6 Q1       58793.71 55790.19 61958.92 54090.84  63905.46
## 6 Q2       60951.51 57837.76 64232.89 56076.04  66250.87
## 6 Q3       70920.09 67297.09 74738.13 65247.24  77086.15
## 6 Q4       93788.66 88997.41 98837.86 86286.57 101943.01
# Helps set up the plot
yrange2 = range(sales.ts)

# Set up the plot
plot(c(1, 7), yrange2, type="n", xlab="Year",  ylab="Sales (thousands)", bty="l", xaxt="n", yaxt="n")

# Add the time series
lines(sales.ts, bty="l")

# Add the x-axis
axis(1, at=seq(1,7,1), labels=format(seq(1,7,1)))

# Add the y-axis
axis(2, at=seq(40000,105000,10000), labels=format(seq(40,105,10)), las=2)

# Add fitted line from training period
lines(salesExpo$fitted, col="red")

# Add forecasts for valiation period
lines(validPeriodForecasts$mean, col="blue", lty=2)

# Add the legend
legend(1,100000, c("Sales", "Exponential Trend+Season", "Forecasts"), lty=c(1,1,2), col=c("black", "red", "blue"), bty="n")


C)

To determine if Q2 average sales are higher, lower or approximately equal to the average Q1 sales, I first looked at the P-values. Notice that season 2, which represents Q2, has a P-value of .248; because this number is not exceptionally small, it indicates that Q2 is not statistically, significantly different from Q1 (which is our base season). Also, looking at the estimate of season 2, even though it is still in log form, I would say that Q2 tends to be slightly higher than Q1 because of its positive value, .024956. When comparing this assessment to the actual values, I find it to be accurate; Q2 tends to be slightly higher than Q1, but not in all cases and not by a significant amount.
summary(tslm(salesTrain ~ trend + season, lambda=0))
## 
## Call:
## tslm(formula = salesTrain ~ trend + season, lambda = 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053524 -0.013199 -0.004527  0.014387  0.062681 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.748945   0.018725 574.057  < 2e-16 ***
## trend        0.011088   0.001295   8.561 3.70e-07 ***
## season2      0.024956   0.020764   1.202    0.248    
## season3      0.165343   0.020884   7.917 9.79e-07 ***
## season4      0.433746   0.021084  20.572 2.10e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 7.63e+11 on 4 and 15 DF,  p-value: < 2.2e-16

D)

Below are the forcasted values for quarters 21 and 22, the first two periods of the validation period:
# Print the forecasted sales for Q21 & Q22
validPeriodForecasts$mean[1:2]
##        1        2 
## 58793.71 60951.51

E)

Looking at the residuals, I would say that the model is likely to over-forecast compared to the real values. The traning period consists of 20 values, and the residuals show that we over-forecast 11 times (negative numbers) and under-forecast 9 times (positive numbers), so the model tends to over-forecast slightly more. However, if we plot the validation period residuals, we see that our model actually under-forecasts.
# Helps set up the plot
yrange3 = range(sales.ts)

# Set up the plot
plot(c(1, 5.75), yrange3, type="n", xlab="Year",  ylab="Sales (thousands)", bty="l", xaxt="n", yaxt="n")

# Add the time series
lines(salesTrain, bty="l")

# Add the x-axis
axis(1, at=seq(1,5.75,1), labels=format(seq(1,5.75,1)))

# Add the y-axis
axis(2, at=seq(40000,105000,10000), labels=format(seq(40,105,10)), las=2)

# Add fitted line from training period
lines(salesExpo$fitted, col="red")

# Add the legend
legend(1,100000, c("Sales", "Exponential Trend+Season"), lty=c(1,1,2), col=c("black", "red"), bty="n")

# Plot residuals using actual values instead of log-transformed units
plot(salesTrain - salesExpo$fitted, type="o", bty="l") 

# Add a line at y = 0 to help identitify positive and negative values
abline(h = 0, col = "red")

# Plot the residuals for the validation period
plot(salesValid - validPeriodForecasts$mean, type="o", bty="l")
abline(h = 0, col = "red")


F)

Based on the training period residual plot only, I would say that the regression model fits the data well because there are almost an equal number of points above and below y = 0 and the values stay in the range [-3000,3000]; the data points are not skewed in any direction and there are no discernable patterns.

G)

I would fit a quadratic trend model to Sales with Quarter and Quarter ^2.

Question 5

A)

There needs to be 13 predictors in all: 11 dummy variables for month, and t and t^2 for trend. This is because the data series has both trend and seasonality.
SouvenirSales <- read_excel("~/rProjects/Assignment_5/SouvenirSales.xlsx")

sales2.ts <- ts(SouvenirSales$Sales, start=c(1995,1), frequency=12)

validLength2 <- 12
trainLength2 <- length(sales2.ts) - validLength2

salesTrain2 <- window(sales2.ts, end=c(1995, trainLength2))
salesValid2 <- window(sales2.ts, start=c(1995,trainLength2+1))

B)

i)It appears that November and December have the highest average sales during the year. This is reasonable because many religious holidays occur at the end of the year - this means that people have time off work to visit the beach and gifts tend to play a big part of the holidays.
ii)The trend coefficient in model A indicates that sales are likely to increase by $245.36 each month - indicating an upward trend.
# Fit the model to the training set
salesLinearSeason2 <- tslm(salesTrain2 ~ trend + season)

# See the estimated regression equation
summary(salesLinearSeason2)
## 
## Call:
## tslm(formula = salesTrain2 ~ trend + season)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12592  -2359   -411   1940  33651 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3065.55    2640.26  -1.161  0.25029    
## trend         245.36      34.08   7.199 1.24e-09 ***
## season2      1119.38    3422.06   0.327  0.74474    
## season3      4408.84    3422.56   1.288  0.20272    
## season4      1462.57    3423.41   0.427  0.67077    
## season5      1446.19    3424.60   0.422  0.67434    
## season6      1867.98    3426.13   0.545  0.58766    
## season7      2988.56    3427.99   0.872  0.38684    
## season8      3227.58    3430.19   0.941  0.35058    
## season9      3955.56    3432.73   1.152  0.25384    
## season10     4821.66    3435.61   1.403  0.16573    
## season11    11524.64    3438.82   3.351  0.00141 ** 
## season12    32469.55    3442.36   9.432 2.19e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5927 on 59 degrees of freedom
## Multiple R-squared:  0.7903, Adjusted R-squared:  0.7476 
## F-statistic: 18.53 on 12 and 59 DF,  p-value: 9.435e-16

C)

i)Fitting a model to log(Sales) with a linear trend is equivalent to fitting a model to Sales (in dollars) with an exponential trend.
ii)The trend coefficient of model B indicates that monthly sales will increase by approximately 2.11% through time.
iii)The forecast for February 2002 is below; our model predicts that sales that month will be $17062.99. I had 84 time periods in this data series, so February of year beyond the validation period would be 86 which is why I multiplied my trend coefficient by 86.
# Regression Model with log(sales) - Model B
modelB <- tslm(log(salesTrain2) ~ trend + season)
summary(modelB)
## 
## Call:
## tslm(formula = log(salesTrain2) ~ trend + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4529 -0.1163  0.0001  0.1005  0.3438 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.646363   0.084120  90.898  < 2e-16 ***
## trend       0.021120   0.001086  19.449  < 2e-16 ***
## season2     0.282015   0.109028   2.587 0.012178 *  
## season3     0.694998   0.109044   6.374 3.08e-08 ***
## season4     0.373873   0.109071   3.428 0.001115 ** 
## season5     0.421710   0.109109   3.865 0.000279 ***
## season6     0.447046   0.109158   4.095 0.000130 ***
## season7     0.583380   0.109217   5.341 1.55e-06 ***
## season8     0.546897   0.109287   5.004 5.37e-06 ***
## season9     0.635565   0.109368   5.811 2.65e-07 ***
## season10    0.729490   0.109460   6.664 9.98e-09 ***
## season11    1.200954   0.109562  10.961 7.38e-16 ***
## season12    1.952202   0.109675  17.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared:  0.9424, Adjusted R-squared:  0.9306 
## F-statistic:  80.4 on 12 and 59 DF,  p-value: < 2.2e-16
# Use the named coefficients
febForecast <- modelB$coefficients["(Intercept)"] + modelB$coefficients["trend"]*86 + modelB$coefficients["season2"]

# Convert them back and simply echo them
exp(febForecast)
## (Intercept) 
##    17062.99

D)

Model B is preferable for forecasting because:
1. The MAPE value for model B is considerably lower than model A; 15.52% verses 26.67%
2. The MAE/MAD (Mean Absolute Error/Deviation) for model B is almost half as model A, $5,191.67 verses $10,055.28
Both of the reasons above indicate that model B is more accurate and should be used to forecast this series.
modelAForecast <- forecast(salesLinearSeason2, h=validLength2)

accuracy(modelAForecast, salesValid2)
##                         ME      RMSE       MAE       MPE     MAPE     MASE
## Training set -5.684342e-14  5365.199  3205.089  6.967778 36.75088 0.855877
## Test set      8.251513e+03 17451.547 10055.276 10.533974 26.66568 2.685130
##                   ACF1 Theil's U
## Training set 0.4048039        NA
## Test set     0.3206228 0.9075924
modelBForecast <- forecast(modelB, h=validLength2)

# We have to tranform the forecasts back to the original scale.
accuracy(exp(modelBForecast$mean), salesValid2)
##                ME     RMSE      MAE      MPE    MAPE      ACF1 Theil's U
## Test set 4824.494 7101.444 5191.669 12.35943 15.5191 0.4245018 0.4610253

E)

1. I would plot the data separately by month in order to better identify and visualize monthly trends.
2. If daily sales data were available, I would look for correlations between daily sales and average daily temperature to see if that has an impact on sales. This may tell the owners that on colder days people tend to come inside to shop while they perfer to stay out on the beach on warmer days.
3. I may attempt a smoothing techinque, averaging the previous month, current month and next month to better see sales averages over the years.