knitr::opts_chunk$set(echo = TRUE)
#  Load libraries and set environment options
library(knitr)
library(forecast)
library(ggplot2)
library(fpp)
library(readxl)
#library(dplyr)
#library(tidyr)#library(zoo)

#  Use this option to supress scientific notation in printing values
options(scipen = 10, digits = 2)

Assignment 6, Problem 2

Analysis of Canadian Manufacturing Workers Work-Hours: The time series plot in Figure 6.10 describes the average annual number of weekly hours spent by Canadian manufacturing workers.

Which one model of the following regression-based models would fit the series best?

- Linear trend model

- Linear trend model with seasonality

- Quadradratic trend model

- Quadradratic trend model with seasonality

#  Create a times series and linear model, then plot the data
work_hours.ts<-ts(work_hours$`Hours per week`, start=1966)
work_hours.linear<- tslm(work_hours.ts ~ trend, lamda=1)
work_hours.quad<- tslm(work_hours.ts ~ trend + I(trend^2))
summary(work_hours.linear)
## 
## Call:
## tslm(formula = work_hours.ts ~ trend, lamda = 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2046 -0.2876  0.0478  0.3021  1.2319 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept) 37.41613    0.17582   212.8     < 2e-16 ***
## trend       -0.06137    0.00852    -7.2 0.000000029 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.51 on 33 degrees of freedom
## Multiple R-squared:  0.611,  Adjusted R-squared:   0.6 
## F-statistic: 51.9 on 1 and 33 DF,  p-value: 0.0000000293
summary(work_hours.quad)
## 
## Call:
## tslm(formula = work_hours.ts ~ trend + I(trend^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9450 -0.2096 -0.0165  0.3186  0.6016 
## 
## Coefficients:
##              Estimate Std. Error t value   Pr(>|t|)    
## (Intercept) 38.164416   0.217660  175.34    < 2e-16 ***
## trend       -0.182715   0.027879   -6.55 0.00000022 ***
## I(trend^2)   0.003371   0.000751    4.49 0.00008760 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.40 on 32 degrees of freedom
## Multiple R-squared:  0.761,  Adjusted R-squared:  0.747 
## F-statistic: 51.1 on 2 and 32 DF,  p-value: 0.00000000011
# Plot the time series with the linear and quadratic trend lines
yrange = range(work_hours.ts)
plot(c(1965, 2000), yrange, type="n", xlab="YEAR", ylab="Average Weekly Hours", bty="l", xaxt="n", yaxt="n")

lines(work_hours.ts, bty="l")
axis(1, at=seq(1970, 2000, 5), labels=format(seq(1970,2000,5)))
axis(2, at=seq(32,39, 1), labels=format(seq(32, 39, 1)), las=0)

lines(work_hours.linear$fitted, col="blue")
lines(work_hours.quad$fitted, col="green")

legend(1985, 37.5, c("Actual", "Linear", "Quadratic"), lty=c(1,1,1), col=c("black", "blue", "green"), bty="n")

Given that the data are yearly averages, seasonality would not be an option, leaving a choice between linear and quadratic regression. The p-value is low for both the linear and quadratic models, but the quadratic model is significantly lower p-value and higher r-squared. Therefore I believe the quadratic model is a better fit. When running the model and trend line against the data, the quatratic trend line does appear to be a better fit.

Assignment 6, Problem 4

Forecasting Department Store Sales: The time series plot shown in Figure 6.12 describes actual quarterly sales for a department store over a 6-year period.

# Take a quick look at the time series

plot(dept_sales.ts, type="b", xlab="Year", ylab="Sales (in thousands)", bty="l", xaxt="n", yaxt="n")

axis(1, at=seq(1, 6, 1), labels=format(seq(1, 6, 1)))
axis(2, at=seq(40000, 120000, 20000), labels=format(seq(40, 120, 20)), las=0)

(a) The forecaster decided that there is an exponential trend in the series. In order to fit a regression-based model that accounts for this trend, which of the following operations must be performed (either manually or by a function in R)?

- Take a logarithm of the Quarter index

- Take a logarithm of sales.

- Take an exponent of sales

- Take an exponent of Quarter index

In order to fit the exponential trend to the model, the forecaster must take a log of the sales value. This is done using the lambda=0 option when using the tslm function in R to create a linear model.

(b) Fit a regression model with an exponential trend and seasonality, using only the first 20 quarters as the training period (remember to first partition the series into training and validation periods).

#  Create training and validation periods
validLen <- 4
trainLen <- length(dept_sales.ts) - validLen
dept_sales_train <- window(dept_sales.ts, end=c(1, trainLen))
dept_sales_valid <- window(dept_sales.ts, start=c(1, trainLen+1))
#  Fit an exponential trend with seasonality model to the training set
dept_salesExpo<- tslm(dept_sales_train ~ trend + season, lambda = 0)

summary(dept_salesExpo)
## 
## Call:
## tslm(formula = dept_sales_train ~ trend + season, lambda = 0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05352 -0.01320 -0.00453  0.01439  0.06268 
## 
## Coefficients:
##             Estimate Std. Error t value        Pr(>|t|)    
## (Intercept)  10.7489     0.0187  574.06         < 2e-16 ***
## trend         0.0111     0.0013    8.56 0.0000003702488 ***
## season2       0.0250     0.0208    1.20            0.25    
## season3       0.1653     0.0209    7.92 0.0000009788403 ***
## season4       0.4338     0.0211   20.57 0.0000000000021 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.033 on 15 degrees of freedom
## Multiple R-squared:     1,   Adjusted R-squared:     1 
## F-statistic: 7.63e+11 on 4 and 15 DF,  p-value: <2e-16

(c) A partial output is shown in Table 6.7. From the output, after adjusting for trend, are Q2 average sales higher, lower, or approximately equal to the average Q1 sales?

Q2 is ever-so-slightly higher than Q1, and could be considered approximately equal when taking trend and standard error into consideration.

(d) Use this model to forecast sales in quarters 21 and 22.

Exonential forecast for validation period quarters 21 & 22.

#  Create a forecast for the validation period
#   Quadratic
dept_sales_validForecast<- forecast(dept_salesExpo, h=2)
dept_sales_validForecast
##      Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 6 Q1          58794 55790 61959 54091 63905
## 6 Q2          60952 57838 64233 56076 66251

(e) The plots shown in Figure 6.13 describe the fit (top) and forecast errors (bottom) from this regression model.

i. Recreate these plots.

# Plot the time series with the exponential trend line
yrange = range(dept_sales.ts)

plot(c(1, 6), yrange, type="n",xlab="Quarter", ylab="Sales (in thousands)", bty="l", xaxt="n", yaxt="n")

lines(dept_sales.ts, bty="l")
axis(1, at=seq(1, 6, 1), labels=format(seq(1, 6, 1)))
axis(2, at=seq(40000, 120000, 20000), labels=format(seq(40, 120, 20)), las=2)

lines(dept_salesExpo$fitted, col="red")

legend(1, 105000, c("Sales", "Exponential + Seasonal Trend"), lty=c(1, 1), col=c("black", "red"), bty="n")

# Plot the residuals
#plot(dept_salesExpo$residuals, type="o", bty="l")
plot(dept_sales_train-dept_salesExpo$fitted, type="o", bty="l")

ii. Based on these plots, what can you say about your forecasts for Q21 and Q22? Are they likely to over-forecast, under_forecast, or be reasonably close to the real sales values?

Looking at the plot it would be reasonable to guess that the forecast for Q21 & Q22 will be reasonably close to the actual sales value.

(f) Look at the residual plot, which of the following statements appear true?

- Seasonality is not captured well.

- The regression model fits the data well.

- The trend in the data is not captured well by the model.

If I had to select one of these statements it would be that the trend in the data is not captured well by the model. The seasonality is captured well, and the model fits the data well, but if there is room for improvement it would be in the trend.

(g) Which of the following solutions is adequate and parsimonious solution for improving model fit?

- Fit a quadratic trend model to the residuals (with Quarter and Quarter^2.)

- Fit a quadratic trend model to Sales (with Quarter and Quarter^2.)

I would try fitting a quadratic trend model to sales (with Quarter and Quarter^2.)

Assignment 6, Problem 5

* Souvenir Sales: The file SouvenirSales.xls contains monthly sales for a souvenir shop at a beach resort town in Queensland, Australia, between 1995 and 2001.*

* Back in 2001, the store wanted to use the data to forecast sales for the next 12 months (year 2002). They hired an analyst to generate forecasts. The analyst first partitioned the data into training and validation periods, with the validation set containing the last 12 months of data (year 2001). She then fit a regression model to sales, using the training period. *

Read in the raw data from the xls file.

(a) Based on the two time plots in FIgure 6.14, which predictors should be included in the regression model? What is the total number of predictors in the model?

Looking at the two plots it appears that a linear model with seasonality would be the best model to use. There should be 3 predictors, trend, I(trend^2), and season.

(b) Run a regression model with Sales (in Australian dollars) as the output variable and with a linear trend and monthly predictors. Remember to fit only the training period. Call this model A.

#  Create a times series
s_sales.ts<-ts(s_sales$`Sales`, start=1995, frequency=12)
plot(s_sales.ts)

#  Create training and validation periods
validLen <- 12
trainLen <- length(s_sales.ts) - validLen
s_sales_train <- window(s_sales.ts, end=c(1995, trainLen))
s_saless_valid <- window(s_sales.ts, start=c(1995, trainLen+1))
#  Fit a linear trend with seasonality model to the training set
s_salesA<- tslm(s_sales_train ~ trend + I(trend^2) + season)

summary(s_salesA)
## 
## Call:
## tslm(formula = s_sales_train ~ trend + I(trend^2) + season)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13610  -1758    176   2288  28154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2921.06    2715.16    1.08  0.28646    
## trend        -250.13     120.05   -2.08  0.04162 *  
## I(trend^2)      6.79       1.59    4.26  7.5e-05 ***
## season2      1187.26    3011.81    0.39  0.69488    
## season3      4531.02    3012.35    1.50  0.13797    
## season4      1625.47    3013.20    0.54  0.59164    
## season5      1636.25    3014.34    0.54  0.58933    
## season6      2071.60    3015.73    0.69  0.49486    
## season7      3192.19    3017.37    1.06  0.29447    
## season8      3417.63    3019.26    1.13  0.26232    
## season9      4118.46    3021.41    1.36  0.17812    
## season10     4943.83    3023.83    1.63  0.10747    
## season11    11592.51    3026.56    3.83  0.00032 ***
## season12    32469.55    3029.64   10.72  2.2e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5220 on 58 degrees of freedom
## Multiple R-squared:  0.84,   Adjusted R-squared:  0.805 
## F-statistic: 23.5 on 13 and 58 DF,  p-value: <2e-16

i. Examine the coefficients: Which month tends to have the highest average sales during the year? Why is this reasonable?

The summary output from linear regression model A indicates that season 12, December, has the highest average sales, with season 11, November, following close behind. This makes sense as this is a seaside resort and November & December would be high tourist season in Queensland.

ii. What does the estimated trend coefficient of model A mean?

The estimated trend coefficient is the average difference in sales for the period, which is January.

(c) Run a regression model with log(Sales) as the output variable and with a linear trend and monthly predictors. Remember to fit only the training period. Call this mode B.

i. Fitting a model to log(Sales) with a linear trend is equivalent to fitting a model to Sales (in dollars) with what type of trend?

#  Fit a linear trend with seasonality model  with a logarithm to the training set
s_salesB<- tslm(log(s_sales_train) ~ trend + I(trend^2) + season)

summary(s_salesB)
## 
## Call:
## tslm(formula = log(s_sales_train) ~ trend + I(trend^2) + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4569 -0.1156 -0.0022  0.1024  0.3402 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.65145164 0.09912593   77.19  < 2e-16 ***
## trend       0.02069847 0.00438279    4.72  1.5e-05 ***
## I(trend^2)  0.00000577 0.00005813    0.10  0.92128    
## season2     0.28207257 0.10995613    2.57  0.01292 *  
## season3     0.69510212 0.10997594    6.32  4.0e-08 ***
## season4     0.37401187 0.11000707    3.40  0.00123 ** 
## season5     0.42187153 0.11004841    3.83  0.00031 ***
## season6     0.44721921 0.11009922    4.06  0.00015 ***
## season7     0.58355294 0.11015911    5.30  1.9e-06 ***
## season8     0.54705825 0.11022808    4.96  6.4e-06 ***
## season9     0.63570352 0.11030646    5.76  3.3e-07 ***
## season10    0.72959434 0.11039498    6.61  1.3e-08 ***
## season11    1.20101178 0.11049471   10.87  1.3e-15 ***
## season12    1.95220222 0.11060709   17.65  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.19 on 58 degrees of freedom
## Multiple R-squared:  0.942,  Adjusted R-squared:  0.929 
## F-statistic:   73 on 13 and 58 DF,  p-value: <2e-16

ii. What does the estimated trend coefficient of model B mean?

The estimated trend coefficient in model B represents the percentage increase in sales.

iii. Use this model to forecast the sales in February 2002.

#  Fit a linear trend with seasonality model  with a logarithm to the full set
s_salesBFull<- tslm(log(s_sales.ts) ~ trend + I(trend^2) + season)

summary(s_salesBFull)
## 
## Call:
## tslm(formula = log(s_sales.ts) ~ trend + I(trend^2) + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4824 -0.1152 -0.0007  0.1042  0.3710 
## 
## Coefficients:
##              Estimate Std. Error t value      Pr(>|t|)    
## (Intercept) 7.6873287  0.0883781   86.98       < 2e-16 ***
## trend       0.0166223  0.0033346    4.98 0.00000431898 ***
## I(trend^2)  0.0000679  0.0000380    1.79        0.0783 .  
## season2     0.2517226  0.0978287    2.57        0.0122 *  
## season3     0.6964286  0.0978410    7.12 0.00000000076 ***
## season4     0.3845634  0.0978605    3.93        0.0002 ***
## season5     0.4098953  0.0978868    4.19 0.00008084244 ***
## season6     0.4489992  0.0979195    4.59 0.00001929771 ***
## season7     0.6102523  0.0979584    6.23 0.00000003066 ***
## season8     0.5872533  0.0980034    5.99 0.00000008088 ***
## season9     0.6679740  0.0980549    6.81 0.00000000274 ***
## season10    0.7452556  0.0981130    7.60 0.00000000010 ***
## season11    1.2036953  0.0981784   12.26       < 2e-16 ***
## season12    1.9581366  0.0982516   19.93       < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.18 on 70 degrees of freedom
## Multiple R-squared:  0.955,  Adjusted R-squared:  0.946 
## F-statistic:  114 on 13 and 70 DF,  p-value: <2e-16
#  Forecast February 2002
feb_forecast<- s_salesBFull$coefficients["(Intercept)"] + s_salesBFull$coefficients["trend"]*86 +  s_salesBFull$coefficients["I(trend^2)"]*86 + s_salesBFull$coefficients["season2"]
exp(feb_forecast)
## (Intercept) 
##       11783

(d) Compare the two regression models (A and B) in terms of forecast performance. Which model is preferable for forecasting? Mention at least two reasons based on the information in the outputs.

Based on the output of the accuracy function, the MAPE and RMSE values on model B are better. Also, when plotting both models, model B seems to perform better. The quadratic + seasonal model that I chose may not be the best regression model for this data. Looking at the residuals for model A, this model consistently underperforms, and model B consistently overperforms. If I had more time I would explore others.
#  Run the forecast on the validation period using each model
s_salesAForecast<- forecast(s_salesA, h=s_saless_valid)
s_salesBForecast<- forecast(s_salesB, h=s_saless_valid)
accuracy(s_salesAForecast$mean, s_sales.ts)
##            ME  RMSE  MAE MPE MAPE ACF1 Theil's U
## Test set -871 13934 8956 -29   38 0.26         1
accuracy(exp(s_salesBForecast$mean), s_sales.ts)
##            ME RMSE  MAE MPE MAPE ACF1 Theil's U
## Test set 4603 6811 5002  12   15 0.43      0.45
#   Plot the time series with the quadratic trend line
yrange = range(s_sales.ts)

plot(c(1995, 2002), yrange, type="n",xlab="Month/Year", ylab="Sales (in hundreds)", bty="l", xaxt="n", yaxt="n")

lines(s_sales.ts, bty="l")
axis(1, at=seq(1995, 2002, 1), labels=format(seq(1995, 2002, 1)))
axis(2, at=seq(0, 120000, 20000), labels=format(seq(0, 1200, 200)), las=2)

lines(s_salesAForecast$fitted, col="red")
lines(exp(s_salesBForecast$fitted), col="Blue")

legend(1995, 100000, c("Sales", "Model A", "Model B"), lty=c(1, 1), col=c("black", "red", "blue"), bty="n")

# Plot the residuals
plot(s_salesAForecast$residuals, type="o", bty="l")

plot(exp(s_salesBForecast$residuals), type="o", bty="l")

(e) How would you model this data differently if the goal was understanding the different components of sales in the sourvenir shop between 1995 and 2001? Mention two differences.

If I were only looking at a descriptive model I would not partition the data and I would fit a model to the entire time series.