#Loading necessary libraries:
library(forecast)
library(tidytext)
library(stringr)
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(knitr)
library(pander)
For this assignment I will be completing the assigned problems (Problems 2, 3, and 4) from Chapter 7, Regression-Based Models: Autocorrelation & External Information, of the textbook Practical Time Series Forecasting with R by Galit Shmueli and Kenneth C. Lichtendahl Jr.
First I will import the data set and look at the head, tail, and structure.
Walmart <- read.csv("WalMartStock.csv", stringsAsFactors = FALSE)
head(Walmart)
## Date Close
## 1 5-Feb-01 53.84
## 2 6-Feb-01 53.20
## 3 7-Feb-01 54.66
## 4 8-Feb-01 52.30
## 5 9-Feb-01 50.40
## 6 12-Feb-01 53.45
tail(Walmart)
## Date Close
## 243 28-Jan-02 58.63
## 244 29-Jan-02 57.91
## 245 30-Jan-02 59.75
## 246 31-Jan-02 59.98
## 247 1-Feb-02 59.26
## 248 4-Feb-02 58.90
str(Walmart)
## 'data.frame': 248 obs. of 2 variables:
## $ Date : chr "5-Feb-01" "6-Feb-01" "7-Feb-01" "8-Feb-01" ...
## $ Close: num 53.8 53.2 54.7 52.3 50.4 ...
Walmart_TS <- ts(Walmart$Close, frequency=1)
yrange=range(Walmart_TS)
plot(c(0, 248), c(40,65), type="n", xlab="Day", ylab="Walmart Closing Stock Price", bty="l", xaxt="n", yaxt="n")
axis(1, at=seq(1, 248, 20), labels=format(seq(1, 248, 20)), las=0)
axis(2, at=seq(40,65,2), labels=format(seq(40,65,2)), las=0)
lines(Walmart_TS)
autoplot(Walmart_TS)
Acf(Walmart_TS, lag.max = 1)
Walmart_ACF <- Acf(Walmart_TS)
Walmart_ACF
##
## Autocorrelations of series 'Walmart_TS', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 1.000 0.943 0.892 0.846 0.811 0.780 0.747 0.714 0.669 0.618 0.569 0.531
## 12 13 14 15 16 17 18 19 20 21 22 23
## 0.499 0.470 0.441 0.418 0.390 0.346 0.309 0.274 0.252 0.232 0.206 0.175
ggAcf(Walmart_TS)
Diff_Walmart <- Acf(diff(Walmart_TS, lag=1), lag.max=12, main="ACF Plot for Differenced Series of Wal-Mart Closing Stock Prices")
Figure 4 shows that the autocorrelations at lags 1 through 12 all fall within the significance thresholds, which indicates that this series behaves like a random walk.
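Here "the thresholds" are the approximate 95% significance bounds drawn on the ACF plot, roughly \(\pm 2/\sqrt{n}\). A minimal sketch of computing them for the differenced series (not part of the original output):
#Approximate 95% significance bounds for the ACF of the differenced series
n_diff <- length(diff(Walmart_TS, lag=1))
c(lower = -2/sqrt(n_diff), upper = 2/sqrt(n_diff))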
plot(diff(Walmart_TS, lag=1), bty="l")
library(tseries)
adf.test(Walmart_TS)
##
## Augmented Dickey-Fuller Test
##
## data: Walmart_TS
## Dickey-Fuller = -2.6351, Lag order = 6, p-value = 0.3085
## alternative hypothesis: stationary
The autocorrelations of the closing prices series: No.
The AR(1) slope coefficient for the closing price series: Yes. The AR(1) slope coefficient is relevant for testing whether this stock is a random walk because, to test whether a series is a random walk, we fit an AR(1) model and test the hypothesis that the slope coefficient is equal to 1.
The AR(1) constant coefficient for the closing price series: No.
The autocorrelations of the differenced series: Yes. These are relevant because, if the series is a random walk, the ACF plot of the differenced series will show autocorrelations approximately equal to 0, i.e. within the significance thresholds at all lags.
The AR(1) slope coefficient for the differenced series: No.
The AR(1) constant coefficient for the differenced series: No.
fitWalmart <- arima(Walmart_TS, order=c(1,0,0))
fitWalmart
##
## Call:
## arima(x = Walmart_TS, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.9558 52.9497
## s.e. 0.0187 1.3280
##
## sigma^2 estimated as 0.9735: log likelihood = -349.8, aic = 705.59
In order to determine whether this AR model indicates a random walk, we need to look at the p-value: a p-value less than the selected alpha suggests the series is not a random walk, while a p-value greater than alpha suggests it is.
Using the equation from page 153 and the formula from the notes, the slope coefficient and its standard error are used to compute the test statistic, which is first compared against the t-distribution for a "conservative" estimate and then against the normal distribution.
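Stated explicitly (a restatement of that formula, where \(\hat{\beta}_1\) denotes the estimated AR(1) slope coefficient), the hypotheses are \(H_0: \beta_1 = 1\) (random walk) versus \(H_1: \beta_1 \neq 1\), and the test statistic is \(t = \frac{1 - \hat{\beta}_1}{s.e.(\hat{\beta}_1)}\), from which a two-tailed p-value is computed.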
#Calculate the two-tailed p-value from a t-distribution
2*pt(-abs((1 - fitWalmart$coef["ar1"])/sqrt(diag(vcov(fitWalmart)))["ar1"]), df=length(Walmart_TS)-1)
## ar1
## 0.01889832
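#Calculate the two-tailed p-value from a normal distribution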
2*pnorm(-abs((1 - fitWalmart$coef["ar1"])/sqrt(diag(vcov(fitWalmart)))["ar1"]))
## ar1
## 0.01812284
We are testing whether the slope coefficient is equal to 1. The two-tailed p-value from the t-distribution is approximately 0.018898, while the p-value using the normal distribution is approximately 0.018123; both are below \(\alpha\) = 0.05. There is therefore evidence to reject the null hypothesis that the slope coefficient is equal to 1, which suggests the AR(1) model does not indicate a random walk.
It is impossible to obtain useful forecasts of the series: If the series is a random walk, any forecast is only as useful as a naive forecast, i.e. the most recent observed value (see the sketch below).
The series is random: The series itself may not be random, but the changes from one period to another are random.
The changes in the series from one period to the other are random: Yes, the changes from one time period to the next are random.
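To illustrate the naive-forecast point above (a minimal sketch using the forecast package, not part of the original analysis): for a random walk, the best point forecast of every future value is simply the most recent observed value, which is exactly what a naive forecast produces.
#Naive forecast: every future closing price is forecast as the last observed value
naive(Walmart_TS, h=3)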
SS <- read.csv("SouvenirSales.csv", stringsAsFactors = FALSE)
SS_TS <- ts(SS$Sales, start=c(1995,1), frequency=12)
autoplot(SS_TS)
head(SS_TS)
## Jan Feb Mar Apr May Jun
## 1995 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74
tail(SS_TS)
## Jul Aug Sep Oct Nov Dec
## 2001 26155.15 28586.52 30505.41 30821.33 46634.38 104660.67
str(SS_TS)
## Time-Series [1:84] from 1995 to 2002: 1665 2398 2841 3547 3753 ...
#Breaking out the training vs. validation sets on the Souvenir Sales dataset.
ValidLen <- 12
TrainLen <- length(SS_TS) - ValidLen
SS_Train <- window(SS_TS, start=c(1995,1), end=c(1995, TrainLen))
SS_Valid <- window(SS_TS, start=c(1995, TrainLen+1), end=c(1995, TrainLen+ValidLen))
SS_Log <- tslm(log(SS_Train) ~trend + season)
summary(SS_Log)
##
## Call:
## tslm(formula = log(SS_Train) ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4529 -0.1163 0.0001 0.1005 0.3438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.646363 0.084120 90.898 < 2e-16 ***
## trend 0.021120 0.001086 19.449 < 2e-16 ***
## season2 0.282015 0.109028 2.587 0.012178 *
## season3 0.694998 0.109044 6.374 3.08e-08 ***
## season4 0.373873 0.109071 3.428 0.001115 **
## season5 0.421710 0.109109 3.865 0.000279 ***
## season6 0.447046 0.109158 4.095 0.000130 ***
## season7 0.583380 0.109217 5.341 1.55e-06 ***
## season8 0.546897 0.109287 5.004 5.37e-06 ***
## season9 0.635565 0.109368 5.811 2.65e-07 ***
## season10 0.729490 0.109460 6.664 9.98e-09 ***
## season11 1.200954 0.109562 10.961 7.38e-16 ***
## season12 1.952202 0.109675 17.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared: 0.9424, Adjusted R-squared: 0.9306
## F-statistic: 80.4 on 12 and 59 DF, p-value: < 2.2e-16
After building the regression model above, the model is used to forecast sales for February 2002. A trend value of 86 is used because February 2002 falls two periods beyond the end of the full series (December 2001) and 14 periods beyond the end of the training period (December 2000), and season2 is the seasonal dummy for February.
FebForecast <- SS_Log$coefficients["(Intercept)"] + SS_Log$coefficients["trend"]*86 + SS_Log$coefficients["season2"]
exp(FebForecast)
## (Intercept)
## 17062.99
The forecast for February 2002 based on the training set is 17,062.99 Australian dollars.
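As a cross-check (a sketch using the forecast package loaded above, not part of the original computation), the same point forecast can be obtained by forecasting 14 months beyond the training period, which ends in December 2000, and exponentiating the February 2002 value:
#Cross-check: h = 14 spans January 2001 through February 2002
SS_CheckForecast <- forecast(SS_Log, h=14)
exp(SS_CheckForecast$mean[14])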
SS_ACF <- Acf(SS_Log$residuals, lag.max=15)
SS_ACF
##
## Autocorrelations of series 'SS_Log$residuals', by lag
##
## 0 1 2 3 4 5 6 7 8 9
## 1.000 0.459 0.485 0.194 0.088 0.154 0.016 0.030 0.106 0.034
## 10 11 12 13 14 15
## 0.152 -0.055 -0.012 -0.047 -0.077 -0.023
SS_ARModel2 <- Arima(SS_Log$residuals, order=c(2,0,0))
SS_ARModel2
## Series: SS_Log$residuals
## ARIMA(2,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 mean
## 0.3072 0.3687 -0.0025
## s.e. 0.1090 0.1102 0.0489
##
## sigma^2 estimated as 0.0205: log likelihood=39.03
## AIC=-70.05 AICc=-69.46 BIC=-60.95
#Calculating the t statistics for each: coefficient / s.e.
#>2 (or <-2) is statistically significant
SS_ARModel2$coef["ar1"] /sqrt(diag(vcov(SS_ARModel2)))["ar1"]
## ar1
## 2.819441
The AR(1) coefficient is statistically significant.
S.E. of AR(1):
sqrt(diag(vcov(SS_ARModel2)))["ar1"]
## ar1
## 0.1089629
#Calculating the t statistics for each: coefficient / s.e.
#>2 (or <-2) is statistically significant
SS_ARModel2$coef["ar2"] / sqrt(diag(vcov(SS_ARModel2)))["ar2"]
## ar2
## 3.346371
The AR(2) coefficient is statistically significant.
S.E. of AR(2):
sqrt(diag(vcov(SS_ARModel2)))["ar2"]
## ar2
## 0.1101939
#Estimating p-values based on normal distribution for AR(1)
2*pnorm(-abs(SS_ARModel2$coef["ar1"]/0.1089629))
## ar1
## 0.004810732
The p-value for the AR(1) coefficient is 0.004810732.
#Estimating p-values based on normal distribution for AR(2)
2*pnorm(-abs(SS_ARModel2$coef["ar2"]/0.1101939))
## ar2
## 0.0008187679
The p-value for the AR(2) coefficient is 0.0008187679.
Acf(SS_ARModel2$residuals)
Figure 7 shows the ACF of the AR(2) model's residuals. The positive, statistically significant AR(1) and AR(2) coefficients indicate that accounting for the autocorrelation in the regression residuals should improve the forecast.
Looking at the calculated p-values, both the AR(1) and AR(2) coefficients are statistically significant. The AR(2) model (an AR model with lag 2) seems to be the more appropriate choice because its p-value is smaller and therefore more strongly significant; this means that forecast errors two time periods apart are more strongly correlated than those one time period apart.
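One way to support this choice (a minimal sketch, not part of the original analysis) is to fit an AR(1) model to the same residuals and compare information criteria; a lower AIC for the AR(2) model would favor it.
#Fit an AR(1) model to the regression residuals for comparison with the AR(2) model
SS_ARModel1 <- Arima(SS_Log$residuals, order=c(1,0,0))
#The model with the lower AIC is preferred
c(AR1 = SS_ARModel1$aic, AR2 = SS_ARModel2$aic)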
Using the regression model and the AR(2) model of its residuals, a forecast for January 2001 can be developed.
LinearRegressionForecast <- forecast(SS_Log, h=ValidLen)
LinearRegressionForecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2001 9.188097 8.917220 9.458974 8.769890 9.606304
## Feb 2001 9.491232 9.220354 9.762109 9.073024 9.909439
## Mar 2001 9.925335 9.654457 10.196212 9.507127 10.343542
## Apr 2001 9.625329 9.354452 9.896207 9.207122 10.043537
## May 2001 9.694286 9.423408 9.965163 9.276078 10.112493
## Jun 2001 9.740741 9.469864 10.011619 9.322534 10.158949
## Jul 2001 9.898195 9.627318 10.169072 9.479988 10.316402
## Aug 2001 9.882831 9.611954 10.153708 9.464624 10.301038
## Sep 2001 9.992619 9.721742 10.263496 9.574412 10.410826
## Oct 2001 10.107664 9.836787 10.378542 9.689457 10.525872
## Nov 2001 10.600248 10.329370 10.871125 10.182040 11.018455
## Dec 2001 11.372615 11.101738 11.643493 10.954408 11.790823
ErrorForecast <- forecast(SS_ARModel2, h=ValidLen)
ErrorForecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2001 0.107882119 -0.07561892 0.2913832 -0.1727585 0.3885227
## Feb 2001 0.098551352 -0.09341395 0.2905167 -0.1950342 0.3921370
## Mar 2001 0.069245043 -0.14069093 0.2791810 -0.2518243 0.3903144
## Apr 2001 0.056801003 -0.15830920 0.2719112 -0.2721817 0.3857837
## May 2001 0.042171322 -0.17774923 0.2620919 -0.2941681 0.3785108
## Jun 2001 0.033088136 -0.18905521 0.2552315 -0.3066508 0.3728271
## Jul 2001 0.024902959 -0.19881529 0.2486212 -0.3172446 0.3670505
## Aug 2001 0.019038932 -0.20554500 0.2436229 -0.3244325 0.3625104
## Sep 2001 0.014219137 -0.21092154 0.2393598 -0.3301038 0.3585421
## Oct 2001 0.010576068 -0.21489090 0.2360430 -0.3342459 0.3553980
## Nov 2001 0.007679567 -0.21798999 0.2333491 -0.3374522 0.3528114
## Dec 2001 0.005446339 -0.22034478 0.2312375 -0.3398714 0.3507641
After creating forecasts from both the regression model and the AR(2) model of its residuals, we can combine them into a single adjusted forecast.
adjustedForecast <- LinearRegressionForecast$mean + ErrorForecast$mean
adjustedForecast
## Jan Feb Mar Apr May Jun Jul
## 2001 9.295979 9.589783 9.994580 9.682130 9.736457 9.773830 9.923098
## Aug Sep Oct Nov Dec
## 2001 9.901870 10.006838 10.118240 10.607927 11.378062
exp(adjustedForecast)
## Jan Feb Mar Apr May Jun Jul
## 2001 10894.13 14614.70 21907.40 16028.61 16923.47 17567.92 20396.07
## Aug Sep Oct Nov Dec
## 2001 19967.68 22177.61 24791.11 40454.26 87383.49
The improved sales forecast for souvenirs for January 2001 is 10,894.13 Australian dollars.
plot(SS_Valid,xlab="Month in 2001",ylab="Sales (in thousands of Australian dollars)",bty="l",xaxt="n",yaxt="n",lwd=2)
axis(1, at=seq(2001,2001+11/12,1/12), labels=c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
axis(2,at=seq(0,110000,10000),labels=format(seq(0,110,10)),las=2)
lines(exp(adjustedForecast),col="Red",lwd=1.5)
legend(2001,110000, c("Validation Period", "Adjusted Forecast"), lty=c(1,1), col=c("black", "Red"), bty="n")
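To quantify how closely the adjusted forecast tracks the validation period in the plot above (a minimal sketch, not part of the original write-up), the forecast package's accuracy() function can compare the back-transformed forecasts with the actual 2001 sales.
#Compare the adjusted forecasts (in Australian dollars) against the validation period
accuracy(exp(adjustedForecast), SS_Valid)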
Appliances <- read.csv("ApplianceShipments.csv", stringsAsFactors = FALSE)
head(Appliances)
## Quarter Shipments
## 1 Q1-1985 4009
## 2 Q2-1985 4321
## 3 Q3-1985 4224
## 4 Q4-1985 3944
## 5 Q1-1986 4123
## 6 Q2-1986 4522
tail(Appliances)
## Quarter Shipments
## 15 Q3-1988 4417
## 16 Q4-1988 4258
## 17 Q1-1989 4245
## 18 Q2-1989 4900
## 19 Q3-1989 4585
## 20 Q4-1989 4533
str(Appliances)
## 'data.frame': 20 obs. of 2 variables:
## $ Quarter : chr "Q1-1985" "Q2-1985" "Q3-1985" "Q4-1985" ...
## $ Shipments: int 4009 4321 4224 3944 4123 4522 4657 4030 4493 4806 ...
Appliances_TS <- ts(Appliances$Shipments, start=c(1985,1), frequency=4)
autoplot(Appliances_TS)
If we assume quarterly seasonality exists then lag-4 autocorrelation would have the largest coefficient.
AppliancesACF <- Acf(Appliances_TS)
AppliancesACF
##
## Autocorrelations of series 'Appliances_TS', by lag
##
## 0 1 2 3 4 5 6 7 8 9
## 1.000 0.261 -0.098 0.164 0.387 -0.030 -0.269 0.081 0.086 -0.168
## 10 11 12 13
## -0.325 -0.019 0.047 -0.096
As Figure 11 and the autocorrelation values show, the largest coefficient (apart from lag 0) occurs at lag 4, which is consistent with quarterly seasonality.
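A season plot offers a complementary view of this quarterly pattern (a sketch using the forecast package's ggseasonplot(), not part of the original analysis).
#Plot each year's shipments by quarter to visualize the recurring quarterly pattern
ggseasonplot(Appliances_TS, year.labels=TRUE)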