# Loading necessary libraries:

library(forecast)
library(tidytext)
library(stringr)
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(knitr)
library(pander)

Chapter 7: Regression-Based Models: Autocorrelation & External Information

For this assignment I will complete the assigned problems (Problems 2, 3, and 4) from Chapter 7, Regression-Based Models: Autocorrelation & External Information, of the textbook Practical Time Series Forecasting with R by Galit Shmueli and Kenneth C. Lichtendahl Jr.

Problem 2: Forecasting Wal-Mart Stock

First I will import the data set and look at the head, tail, and structure.

Walmart <- read.csv("WalMartStock.csv", stringsAsFactors = FALSE)
head(Walmart)
##        Date Close
## 1  5-Feb-01 53.84
## 2  6-Feb-01 53.20
## 3  7-Feb-01 54.66
## 4  8-Feb-01 52.30
## 5  9-Feb-01 50.40
## 6 12-Feb-01 53.45
tail(Walmart)
##          Date Close
## 243 28-Jan-02 58.63
## 244 29-Jan-02 57.91
## 245 30-Jan-02 59.75
## 246 31-Jan-02 59.98
## 247  1-Feb-02 59.26
## 248  4-Feb-02 58.90
str(Walmart)
## 'data.frame':    248 obs. of  2 variables:
##  $ Date : chr  "5-Feb-01" "6-Feb-01" "7-Feb-01" "8-Feb-01" ...
##  $ Close: num  53.8 53.2 54.7 52.3 50.4 ...

(a) Create a time plot of the differenced series

Figure 1. Time Series of Wal-Mart’s Closing Stock Prices

Walmart_TS <- ts(Walmart$Close, frequency=1)

plot(c(0, 248), c(40, 65), type="n", xlab="Day", ylab="Walmart Closing Stock Price", bty="l", xaxt="n", yaxt="n")
axis(1, at=seq(1, 248, 20), labels=format(seq(1, 248, 20)), las=0)
axis(2, at=seq(40, 65, 2), labels=format(seq(40, 65, 2)), las=0)


lines(Walmart_TS)

autoplot(Walmart_TS)

Figure 2. ACF with a Lag.max = 1 of the Wal-Mart Time Series

Acf(Walmart_TS, lag.max = 1)

Figure 3. ACF of the Wal-Mart Time Series

Walmart_ACF <- Acf(Walmart_TS)

Walmart_ACF
## 
## Autocorrelations of series 'Walmart_TS', by lag
## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
## 1.000 0.943 0.892 0.846 0.811 0.780 0.747 0.714 0.669 0.618 0.569 0.531 
##    12    13    14    15    16    17    18    19    20    21    22    23 
## 0.499 0.470 0.441 0.418 0.390 0.346 0.309 0.274 0.252 0.232 0.206 0.175
ggAcf(Walmart_TS)

Figure 4. ACF Plot of the Differenced Series of Wal-Mart’s Closing Stock Prices

Diff_Walmart <- Acf(diff(Walmart_TS, lag=1), lag.max=12, main="ACF Plot for Differenced Series of Wal-Mart Closing Stock Prices")

Figure 4 shows that the autocorrelations at lags 1 through 12 all fall within the significance thresholds, which suggests that this series is a random walk.
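To make the threshold reading explicit, here is a minimal sketch (the names n and threshold are mine): the approximate 95% significance band for an ACF is \(\pm 1.96/\sqrt{n}\), and each autocorrelation at lags 1 through 12 should fall inside it.

#Approximate 95% significance band for the ACF of the differenced series
n <- length(diff(Walmart_TS, lag=1))
threshold <- 1.96/sqrt(n)
#Drop lag 0 (always 1) and check that lags 1-12 all sit inside the band
all(abs(Diff_Walmart$acf[-1]) < threshold)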

Figure 5. Time Plot of the Differenced Series of Wal-Mart’s Closing Stock Prices

plot(diff(Walmart_TS, lag=1), bty="l")

library(tseries)
adf.test(Walmart_TS)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  Walmart_TS
## Dickey-Fuller = -2.6351, Lag order = 6, p-value = 0.3085
## alternative hypothesis: stationary

The ADF test p-value of 0.3085 is greater than 0.05, so we fail to reject the null hypothesis of non-stationarity, which is consistent with the closing price series being a random walk.

(b) Which of the following is/are relevant for testing whether this stock is a random walk?

The autocorrelations of the closing prices series
No

The AR(1) slope coefficient for the closing price series
Yes. The AR(1) slope coefficient for the closing price series is relevant: to test whether a series is a random walk, we fit an AR(1) model to the series and test the null hypothesis that the slope coefficient equals 1.

The AR(1) constant coefficient for the closing price series
No

The autocorrelations of the differenced series
Yes. The autocorrelations of the differenced series are relevant: if the series is a random walk, the ACF plot of the differenced series will show autocorrelations approximately equal to 0 (i.e., within the significance thresholds) at all lags.

The AR(1) slope coefficient for the differenced series
No

The AR(1) constant coefficient for the differenced series
No

(c) Recreate the AR(1) model output for the Close price series shown in the left of Table 7.4. Does the AR model indicate that this is a random walk? Explain how you reached your conclusion.

fitWalmart <- arima(Walmart_TS, order=c(1,0,0))
fitWalmart
## 
## Call:
## arima(x = Walmart_TS, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.9558    52.9497
## s.e.  0.0187     1.3280
## 
## sigma^2 estimated as 0.9735:  log likelihood = -349.8,  aic = 705.59

In order to determine whether this AR model indicates a random walk, we need to look at the p-value. A p-value less than the selected alpha suggests the series is not a random walk, while a p-value greater than alpha suggests it is.

Using the equation from page 153 and the formula from the notes, I take the slope coefficient and its standard error, compute the test statistic, and feed it to the t-distribution to get a “conservative” p-value; I then repeat the calculation with the normal distribution.
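Written out, the test statistic for the null hypothesis that the slope equals 1 is

\[ t = \frac{1 - \hat{\beta}_1}{s.e.(\hat{\beta}_1)} = \frac{1 - 0.9558}{0.0187} \approx 2.36, \]

which the code below converts to a two-tailed p-value.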

#Calculate the two-tailed p-value from a t-distribution
2*pt(-abs((1 - fitWalmart$coef["ar1"])/sqrt(diag(vcov(fitWalmart)))["ar1"]), df=length(Walmart_TS)-1)
##        ar1 
## 0.01889832
2*pnorm(-abs((1 - fitWalmart$coef["ar1"])/sqrt(diag(vcov(fitWalmart)))["ar1"]))
##        ar1 
## 0.01812284

We are testing whether the slope coefficient is equal to 1. The two-tailed p-value from the t-distribution is approximately 0.018898, while the p-value using the normal distribution is approximately 0.018123. With \(\alpha\) = 0.05, both are significant, which indicates there is evidence to reject the null hypothesis (that the slope coefficient is equal to 1) and suggests the AR model does not indicate a random walk.

(d) What are the implications of finding that a time series is a random walk? Choose the correct statement(s) below.

It is impossible to obtain useful forecasts of the series

No; a random walk can still be forecast, but the best point forecast is simply the naive forecast (the most recent observed value), so forecasting adds no value beyond that.

The series is random
The series may not be random, but the changes from one period to another are random.

The changes in the series from one period to the other are random
Correct; in a random walk, the changes from one time period to the next are random.

Problem 3: Souvenir Sales

SS <- read.csv("SouvenirSales.csv", stringsAsFactors = FALSE)
SS_TS <- ts(SS$Sales, start=c(1995,1), frequency=12)

Figure 6. Plot of Souvenir Sales

autoplot(SS_TS)

head(SS_TS)
##          Jan     Feb     Mar     Apr     May     Jun
## 1995 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74
tail(SS_TS)
##            Jul       Aug       Sep       Oct       Nov       Dec
## 2001  26155.15  28586.52  30505.41  30821.33  46634.38 104660.67
str(SS_TS)
##  Time-Series [1:84] from 1995 to 2002: 1665 2398 2841 3547 3753 ...
#Breaking out the training vs. validation sets on the Souvenir Sales dataset.
ValidLen <- 12
TrainLen <- length(SS_TS) - ValidLen

SS_Train <- window(SS_TS, start=c(1995,1), end=c(1995, TrainLen))

SS_Valid <- window(SS_TS, start=c(1995, TrainLen+1), end=c(1995, TrainLen+ValidLen))
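As a quick sanity check on the split (a sketch; the expected lengths follow from the 84-month series), the training window should hold the 72 months from January 1995 through December 2000 and the validation window the 12 months of 2001.

#Confirm the training/validation split lengths
length(SS_Train) #expected: 72
length(SS_Valid) #expected: 12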

(a) Run a regression model with log(Sales) as the output variable and with a linear trend and monthly predictors. Remember to fit only the training period. Use this model to forecast the sales in February 2002.

SS_Log <- tslm(log(SS_Train) ~ trend + season)
summary(SS_Log)
## 
## Call:
## tslm(formula = log(SS_Train) ~ trend + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4529 -0.1163  0.0001  0.1005  0.3438 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.646363   0.084120  90.898  < 2e-16 ***
## trend       0.021120   0.001086  19.449  < 2e-16 ***
## season2     0.282015   0.109028   2.587 0.012178 *  
## season3     0.694998   0.109044   6.374 3.08e-08 ***
## season4     0.373873   0.109071   3.428 0.001115 ** 
## season5     0.421710   0.109109   3.865 0.000279 ***
## season6     0.447046   0.109158   4.095 0.000130 ***
## season7     0.583380   0.109217   5.341 1.55e-06 ***
## season8     0.546897   0.109287   5.004 5.37e-06 ***
## season9     0.635565   0.109368   5.811 2.65e-07 ***
## season10    0.729490   0.109460   6.664 9.98e-09 ***
## season11    1.200954   0.109562  10.961 7.38e-16 ***
## season12    1.952202   0.109675  17.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared:  0.9424, Adjusted R-squared:  0.9306 
## F-statistic:  80.4 on 12 and 59 DF,  p-value: < 2.2e-16

After building the regression model above, this model is then used to forecast the sales in February 2002. The training period ends at t = 72 (December 2000), so February 2002 corresponds to time period 86, two periods beyond the end of the full 84-month series, and season2 is the dummy variable for February.

FebForecast <- SS_Log$coefficients["(Intercept)"] + SS_Log$coefficients["trend"]*86 + SS_Log$coefficients["season2"]
exp(FebForecast)
## (Intercept) 
##    17062.99

The forecast for February 2002 based on the training set is 17,062.99 Australian dollars.
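As a cross-check (a sketch; the h = 14 horizon is my reading of the gap between the training end in December 2000 and February 2002), forecast() applied to the tslm object should reproduce the hand computation once the point forecast is exponentiated back from the log scale.

#February 2002 is the 14th period after the training end (Dec 2000)
exp(forecast(SS_Log, h=14)$mean[14])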

(b) Create an ACF plot until lag-15 for the forecast errors. Now fit an AR model with lag-2 [ARIMA(2,0,0)] to the forecast errors.

Figure 7. ACF Plot of Souvenir Sales Residuals (using the log(Sales))

SS_ACF <- Acf(SS_Log$residuals, lag.max=15)

SS_ACF
## 
## Autocorrelations of series 'SS_Log$residuals', by lag
## 
##      0      1      2      3      4      5      6      7      8      9 
##  1.000  0.459  0.485  0.194  0.088  0.154  0.016  0.030  0.106  0.034 
##     10     11     12     13     14     15 
##  0.152 -0.055 -0.012 -0.047 -0.077 -0.023

i. Examining the ACF plot and the estimated coefficients of the AR(2) model (and their statistical significance), what can we learn about the regression forecasts?

SS_ARModel2 <- Arima(SS_Log$residuals, order=c(2,0,0))
SS_ARModel2
## Series: SS_Log$residuals 
## ARIMA(2,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1     ar2     mean
##       0.3072  0.3687  -0.0025
## s.e.  0.1090  0.1102   0.0489
## 
## sigma^2 estimated as 0.0205:  log likelihood=39.03
## AIC=-70.05   AICc=-69.46   BIC=-60.95
#Calculating the t statistics for each: coefficient / s.e.
#>2 (or <-2) is statistically significant

SS_ARModel2$coef["ar1"] /sqrt(diag(vcov(SS_ARModel2)))["ar1"]
##      ar1 
## 2.819441

Statistically significant

S.E. of the AR(1) coefficient:

sqrt(diag(vcov(SS_ARModel2)))["ar1"]
##       ar1 
## 0.1089629
#Calculating the t statistics for each: coefficient / s.e.
#>2 (or <-2) is statistically significant
SS_ARModel2$coef["ar2"] / sqrt(diag(vcov(SS_ARModel2)))["ar2"]
##      ar2 
## 3.346371

Statistically significant

S.E. of the AR(2) coefficient:

sqrt(diag(vcov(SS_ARModel2)))["ar2"]
##       ar2 
## 0.1101939
#Estimating the p-value for the AR(1) coefficient based on the normal distribution

2*pnorm(-abs(SS_ARModel2$coef["ar1"]/sqrt(diag(vcov(SS_ARModel2)))["ar1"]))
##         ar1 
## 0.004810732

The p-value for the AR(1) coefficient is 0.004810732.

#Estimating the p-value for the AR(2) coefficient based on the normal distribution

2*pnorm(-abs(SS_ARModel2$coef["ar2"]/sqrt(diag(vcov(SS_ARModel2)))["ar2"]))
##          ar2 
## 0.0008187679

The p-value for the AR(2) coefficient is 0.0008187679.
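The same z tests can be produced in a single call, as a sketch (assuming the lmtest package is installed; coeftest() builds the tests from coef() and vcov(), exactly as done above):

#One-call alternative for the coefficient z tests
library(lmtest)
coeftest(SS_ARModel2)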

Figure 8. ACF Plot of the Residuals of the AR(2) Model

Acf(SS_ARModel2$residuals)

Figure 7 shows positive autocorrelation in the regression residuals at lags 1 and 2, which means that if we account for the autocorrelation in the residuals we should be able to improve the forecast.

After looking at the calculated p-values, we see that both the AR(1) and AR(2) coefficients are statistically significant: the regression's forecast errors are correlated with the errors one and two months earlier. The lag-2 coefficient is slightly larger, so errors two periods apart are related at least as strongly as those one period apart. Either way, the regression model leaves autocorrelated structure in its residuals that the AR(2) model can capture and use to adjust the forecasts.

ii. Use the autocorrelation information to compute a forecast for January 2002, using the regression model and the AR(2) model above.

Using the regression model together with the AR(2) model of its residuals, an adjusted forecast can be developed.

LinearRegressionForecast <- forecast(SS_Log, h=ValidLen)
LinearRegressionForecast
##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jan 2001       9.188097  8.917220  9.458974  8.769890  9.606304
## Feb 2001       9.491232  9.220354  9.762109  9.073024  9.909439
## Mar 2001       9.925335  9.654457 10.196212  9.507127 10.343542
## Apr 2001       9.625329  9.354452  9.896207  9.207122 10.043537
## May 2001       9.694286  9.423408  9.965163  9.276078 10.112493
## Jun 2001       9.740741  9.469864 10.011619  9.322534 10.158949
## Jul 2001       9.898195  9.627318 10.169072  9.479988 10.316402
## Aug 2001       9.882831  9.611954 10.153708  9.464624 10.301038
## Sep 2001       9.992619  9.721742 10.263496  9.574412 10.410826
## Oct 2001      10.107664  9.836787 10.378542  9.689457 10.525872
## Nov 2001      10.600248 10.329370 10.871125 10.182040 11.018455
## Dec 2001      11.372615 11.101738 11.643493 10.954408 11.790823
ErrorForecast <- forecast(SS_ARModel2, h=ValidLen)
ErrorForecast
##          Point Forecast       Lo 80     Hi 80      Lo 95     Hi 95
## Jan 2001    0.107882119 -0.07561892 0.2913832 -0.1727585 0.3885227
## Feb 2001    0.098551352 -0.09341395 0.2905167 -0.1950342 0.3921370
## Mar 2001    0.069245043 -0.14069093 0.2791810 -0.2518243 0.3903144
## Apr 2001    0.056801003 -0.15830920 0.2719112 -0.2721817 0.3857837
## May 2001    0.042171322 -0.17774923 0.2620919 -0.2941681 0.3785108
## Jun 2001    0.033088136 -0.18905521 0.2552315 -0.3066508 0.3728271
## Jul 2001    0.024902959 -0.19881529 0.2486212 -0.3172446 0.3670505
## Aug 2001    0.019038932 -0.20554500 0.2436229 -0.3244325 0.3625104
## Sep 2001    0.014219137 -0.21092154 0.2393598 -0.3301038 0.3585421
## Oct 2001    0.010576068 -0.21489090 0.2360430 -0.3342459 0.3553980
## Nov 2001    0.007679567 -0.21798999 0.2333491 -0.3374522 0.3528114
## Dec 2001    0.005446339 -0.22034478 0.2312375 -0.3398714 0.3507641

After creating forecasts from the regression model and from the AR(2) model of its residuals, we can combine the two to produce an adjusted forecast.

adjustedForecast <- LinearRegressionForecast$mean + ErrorForecast$mean
adjustedForecast
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2001  9.295979  9.589783  9.994580  9.682130  9.736457  9.773830  9.923098
##            Aug       Sep       Oct       Nov       Dec
## 2001  9.901870 10.006838 10.118240 10.607927 11.378062
exp(adjustedForecast)
##           Jan      Feb      Mar      Apr      May      Jun      Jul
## 2001 10894.13 14614.70 21907.40 16028.61 16923.47 17567.92 20396.07
##           Aug      Sep      Oct      Nov      Dec
## 2001 19967.68 22177.61 24791.11 40454.26 87383.49

The improved souvenir sales forecast for January 2001, the first month of the validation period, is 10,894.13 Australian dollars.
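To check whether the adjustment helps across the whole validation period, a short sketch (using accuracy() from the already-loaded forecast package) compares the unadjusted and adjusted forecasts against the actual sales:

#Validation accuracy: regression alone vs. regression + AR(2) adjustment
accuracy(exp(LinearRegressionForecast$mean), SS_Valid)
accuracy(exp(adjustedForecast), SS_Valid)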

Figure 9. Adjusted Forecast vs. Validation Period for Souvenir Sales

plot(SS_Valid, xlab="Month in 2001", ylab="Sales (in thousands of Australian dollars)", bty="l", xaxt="n", yaxt="n", lwd=2)
axis(1, at=seq(2001, 2001+11/12, 1/12), labels=c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
axis(2, at=seq(0, 110000, 10000), labels=format(seq(0, 110, 10)), las=2)
lines(exp(adjustedForecast), col="red", lwd=1.5)
legend(2001, 110000, c("Validation Period", "Adjusted Forecast"), lty=c(1,1), col=c("black", "red"), bty="n")

Problem 4: Shipments of Household Appliances

Appliances <- read.csv("ApplianceShipments.csv", stringsAsFactors = FALSE)
head(Appliances)
##   Quarter Shipments
## 1 Q1-1985      4009
## 2 Q2-1985      4321
## 3 Q3-1985      4224
## 4 Q4-1985      3944
## 5 Q1-1986      4123
## 6 Q2-1986      4522
tail(Appliances)
##    Quarter Shipments
## 15 Q3-1988      4417
## 16 Q4-1988      4258
## 17 Q1-1989      4245
## 18 Q2-1989      4900
## 19 Q3-1989      4585
## 20 Q4-1989      4533
str(Appliances)
## 'data.frame':    20 obs. of  2 variables:
##  $ Quarter  : chr  "Q1-1985" "Q2-1985" "Q3-1985" "Q4-1985" ...
##  $ Shipments: int  4009 4321 4224 3944 4123 4522 4657 4030 4493 4806 ...

Figure 10. Number of Shipments of Appliances by Quarter

Appliances_TS <- ts(Appliances$Shipments, start=c(1985,1), frequency=4)
autoplot(Appliances_TS)

(a) If we compute the autocorrelation of the series, which lag (>0) is most likely to have the largest coefficient (in absolute value)?

If we assume quarterly seasonality exists, then the lag-4 autocorrelation would have the largest coefficient, since with quarterly data lag 4 compares the same quarter in consecutive years.

(b) Create an ACF plot and compare it with your answer.

Figure 11. ACF Plot of Appliance Shipments

AppliancesACF <- Acf(Appliances_TS)

AppliancesACF
## 
## Autocorrelations of series 'Appliances_TS', by lag
## 
##      0      1      2      3      4      5      6      7      8      9 
##  1.000  0.261 -0.098  0.164  0.387 -0.030 -0.269  0.081  0.086 -0.168 
##     10     11     12     13 
## -0.325 -0.019  0.047 -0.096

As Figure 11 and the printed coefficient values show, the largest autocorrelation in absolute value after lag 0 is at lag-4 (0.387), confirming the answer from part (a).
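The same conclusion can be reached programmatically; a minimal sketch reusing the AppliancesACF object created above:

#Find the lag (>0) with the largest absolute autocorrelation
which.max(abs(AppliancesACF$acf[-1])) #the index corresponds to the lag number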