Due Monday, April 2nd, 2018 at 11:59PM: Problems 2, 3 and 4 from Chapter 7 of Shmueli
#Load the required packages
library(gdata)     # read.xls()
library(forecast)  # Acf(), Arima(), tslm(), forecast(), accuracy()
library(pander)    # pander() tables
#Upload of the WalMartStock.xls file
walmart_stock_data <- read.xls("WalMartStock.xls")
walmart.ts <- ts(walmart_stock_data[,2], start = c(2001), frequency = 252)
#252 is the approximate number of days per year that the stock market is open: 365.25 (days in a year) * 5/7 (weekday fraction) = 260.89, minus roughly 9 market holidays = 251.89, rounded to 252.
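#As an illustrative aside (not part of the assignment), the trading-day arithmetic can be checked directly:
trading_days <- 365.25 * 5/7 - 9  #weekdays per year minus roughly 9 market holidays: ~251.9
round(trading_days)               #252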
#Plotting the Daily closing price of Wal-Mart stock
yrange <- range(walmart.ts)
plot(walmart.ts, main = "Daily Closing Price of Wal-Mart Stock", xlab = "Time", ylab = "Close Prices ($)", ylim = yrange, bty = "l", xaxt = "n")
#X-axis
axis(1, at=c(2001.0, 2001.2, 2001.4, 2001.6, 2001.8, 2002.0), labels=c("Mar-01", "May-01", "Jul-01","Sep-01", "Nov-01", "Jan-02"))
par(mfrow = c(1,2))
Acf(walmart.ts, main = "ACF Plot for Close", lag.max = 10)
Acf(diff(walmart.ts, lag=1), main="ACF Plot for Differenced Series", lag.max = 10)
plot(diff(walmart.ts, lag = 1), main = "Time Plot of Walmart Differenced Series", xaxt = "n")
axis(1, at=c(2001.0, 2001.2, 2001.4, 2001.6, 2001.8, 2002.0), labels=c("Mar-01", "May-01", "Jul-01","Sep-01", "Nov-01", "Jan-02"))
# Calling arima() with order = c(1,0,0) fits an AR(1) model
walmart_fit <- arima(walmart.ts, order = c(1,0,0))
pander(walmart_fit)
Call: arima(x = walmart.ts, order = c(1, 0, 0))

| | ar1 | intercept |
|---|---|---|
| coefficient | 0.9558 | 52.95 |
| s.e. | 0.01869 | 1.328 |
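Written out, the fitted model is \(\hat{y}_t = 52.95 + 0.9558\,(y_{t-1} - 52.95)\); note that R's arima() labels the series mean as "intercept".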
Calculating the p-value from a t-distribution per the lecture video. The statistic compares the ar1 estimate to 1, i.e., it tests the null hypothesis that the series is a random walk.
pander(2*pt(-abs((1 - walmart_fit$coef["ar1"]) / sqrt(diag(vcov(walmart_fit)))["ar1"]), df=length(walmart.ts)-1))
| ar1 |
|---|
| 0.0189 |
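An equivalent illustrative check is whether a 95% confidence interval for the ar1 coefficient covers 1:

#95% CI for ar1: roughly (0.919, 0.992); 1 falls outside, consistent with rejecting the random-walk null at the 5% level
walmart_fit$coef["ar1"] + c(-1.96, 1.96) * sqrt(diag(vcov(walmart_fit)))["ar1"]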
Calculating the p-value from a t-distribution using the 0.01869 s.e. value
pander(2*pt(-abs((1 - walmart_fit$coef["ar1"]) / 0.01869), df=length(walmart.ts)-1))
| ar1 |
|---|
| 0.0189 |
Calculating the p-value using the normal distribution per the lecture video
pander(2*pnorm(-abs((1 - walmart_fit$coef["ar1"]) / sqrt(diag(vcov(walmart_fit)))["ar1"])))
| ar1 |
|---|
| 0.01812 |
Calculating the p-value using the normal distribution with the 0.01869 s.e. value
pander(2*pnorm(-abs((1 - walmart_fit$coef["ar1"]) / 0.01869)))
| ar1 |
|---|
| 0.01812 |
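As expected, the t-based and normal-based p-values (0.0189 vs. 0.01812) nearly coincide: with a year of daily observations, the t distribution is essentially the standard normal.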
#Upload of the SouvenirSales.xls file
souvenir <- read.xls("SouvenirSales.xls")
#Partition the series: training = Jan 1995 to Dec 2000 (72 months), validation = Jan to Dec 2001 (12 months)
souvenir.ts <- ts(souvenir[,2], start = c(1995,1), frequency = 12)
souvenir_nValid <- 12
souvenir_nTrain <- length(souvenir.ts) - souvenir_nValid
souvenir_train.ts <- window(souvenir.ts, start = c(1995,1), end = c(1995, souvenir_nTrain))
souvenir_valid.ts <- window(souvenir.ts, start = c(1995, souvenir_nTrain + 1), end = c(1995, souvenir_nTrain + souvenir_nValid))
plot(souvenir.ts, main = "Monthly Souvenir Sales by Shop X in Queensland, Australia", xlab = "Year", ylab = "Sales (Australian Dollars)", type = "n", xaxt = "n", yaxt = "n", bty = "l")
lines(souvenir.ts, bty = "l", lwd = 2, col = "000033", type = "o", pch = 18)
axis(1, at = seq(1995,2002, 1), labels = format(seq(1995,2002, 1)))
axis(2, at = seq(0, 125000, 25000), labels = format(seq(0, 125000, 25000), las = 2))
#lambda = 0 applies a log transformation to the response, fitting an exponential trend; wrapping the response in log() as well would transform it twice
souvenir_model <- tslm(souvenir_train.ts ~ trend + season, lambda = 0)
pander(summary(souvenir_model))
| | Estimate | Std. Error | t value | Pr(>\|t\|) |
|---|---|---|---|---|
| (Intercept) | 7.646 | 0.08412 | 90.9 | 4.077e-65 |
| trend | 0.02112 | 0.001086 | 19.45 | 2.416e-27 |
| season2 | 0.282 | 0.109 | 2.587 | 0.01218 |
| season3 | 0.695 | 0.109 | 6.374 | 3.078e-08 |
| season4 | 0.3739 | 0.1091 | 3.428 | 0.001115 |
| season5 | 0.4217 | 0.1091 | 3.865 | 0.0002791 |
| season6 | 0.447 | 0.1092 | 4.095 | 0.0001301 |
| season7 | 0.5834 | 0.1092 | 5.341 | 1.551e-06 |
| season8 | 0.5469 | 0.1093 | 5.004 | 5.366e-06 |
| season9 | 0.6356 | 0.1094 | 5.811 | 2.654e-07 |
| season10 | 0.7295 | 0.1095 | 6.664 | 9.98e-09 |
| season11 | 1.201 | 0.1096 | 10.96 | 7.38e-16 |
| season12 | 1.952 | 0.1097 | 17.8 | 2.129e-25 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 72 | 0.1888 | 1 | 1 |

(The \(R^2\) values of 1 are an artifact of pander's rounding; the unrounded values are slightly below 1.)
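Since the model is fit on the log scale, the trend coefficient translates directly into a growth rate; a quick illustrative calculation with the rounded estimate above:

#Monthly growth rate implied by the log-scale trend coefficient
exp(0.02112) - 1  #~0.0213, i.e., sales grow roughly 2.1% per month, all else equal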
plot(souvenir.ts, main = "Monthly Log Sales by Shop X in Queensland, Australia", xlab = "Year", ylab = "Sales (Australian Dollars)", type = "n", xaxt = "n", yaxt = "n", bty = "l")
lines(souvenir_train.ts, lwd = 2)
lines(souvenir_model$fitted.values, col = "#0080ff", lwd=2)
axis(1, at = seq(1995,2002, 1), labels = format(seq(1995,2002, 1)))
axis(2, at = seq(0, 125000, 25000), labels = format(seq(0, 125000, 25000)), las = 2)
legend(x = "topleft", legend = c("Souvenir sales", "Exponential model"), col = c("black", "#0080ff"), lty = c(1, 1), bty = "n")
#Forecast 14 months ahead: the 12 validation months of 2001 plus January-February 2002
model_forecast <- forecast(souvenir_model, h = 14)
pander(model_forecast$mean, caption = "Souvenir Sales Forecasts from January 2001 to February 2002", split.table = Inf)
| | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2001 | 9780 | 13243 | 20442 | 15144 | 16225 | 16996 | 19894 | 19591 | 21864 | 24530 | 40145 | 86909 |
| 2002 | 12601 | 17063 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
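As an illustrative spot check of the back-transformation (using the rounded coefficients, so expect a small discrepancy): January is the reference season, and January 2001 corresponds to t = 73 (72 training months plus one), so its forecast is:

#exp(intercept + trend * 73); no seasonal dummy since January is the reference season
exp(7.646 + 0.02112 * 73)  #~9777, matching the reported 9780 up to coefficient rounding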
Acf(model_forecast$residuals, lag.max = 15, main = "ACF plot \n until lag-15")
# Calling Arima() with order = c(2,0,0) fits an AR(2) model
souvenir_Arima2 <- Arima(souvenir_model$residuals, order = c(2,0,0))
pander(souvenir_Arima2)
Call: Arima(y = souvenir_model$residuals, order = c(2, 0, 0))

| | ar1 | ar2 | intercept |
|---|---|---|---|
| coefficient | 0.3072 | 0.3687 | -0.002509 |
| s.e. | 0.109 | 0.1102 | 0.04885 |
sigma^2 estimated as 0.0205: log likelihood = 39.03, aic = -70.05
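Written out (with R's "intercept" again reported as the mean \(\mu\)), the fitted AR(2) model of the residuals \(e_t\) is \(e_t - \mu = 0.3072\,(e_{t-1} - \mu) + 0.3687\,(e_{t-2} - \mu) + \varepsilon_t\), with \(\mu = -0.002509\).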
Calculating the t statistics for AR(1)
# Calculate the t statistic for each coefficient: estimate / s.e.
# Rough rule: an absolute t statistic greater than 2 indicates statistical significance
pander(souvenir_Arima2$coef["ar1"] / sqrt(diag(vcov(souvenir_Arima2)))["ar1"])
| ar1 |
|---|
| 2.819 |
Calculating the t statistics for AR(2)
pander(souvenir_Arima2$coef["ar2"] / sqrt(diag(vcov(souvenir_Arima2)))["ar2"])
| ar2 |
|---|
| 3.346 |
Calculating the p-value based on the normal distribution for AR(1)
#Now estimate p-value based on the normal distribution
pander(2*pnorm(-abs(souvenir_Arima2$coef["ar1"] / sqrt(diag(vcov(souvenir_Arima2)))["ar1"])))
| ar1 |
|---|
| 0.004811 |
Calculating the p-value based on the normal distribution for AR(2)
#Now estimate p-value based on the normal distribution
pander(2*pnorm(-abs(souvenir_Arima2$coef["ar2"] / sqrt(diag(vcov(souvenir_Arima2)))["ar2"])))
| ar2 |
|---|
| 0.0008188 |
i. Examining the ACF plot and the estimated coefficients of the AR(2) model (and their statistical significance), what can we learn about the regression forecasts?
The residual series is not a random walk; it contains autocorrelation structure that can be modeled to improve the souvenir sales forecasts.
The t statistics, 2.819 for AR(1) and 3.346 for AR(2), both exceed 2, so both coefficients are statistically significant. Likewise, the p-values, 0.004811 for AR(1) and 0.0008188 for AR(2), both fall below a significance level of α = 0.01. Comparing the two, the AR(2) coefficient is the more strongly significant.
We can also examine the autocorrelations of the residuals-of-residuals, i.e., the residuals left after fitting the AR(2) model to the regression residuals. Because no lags extend beyond the dashed blue significance lines, we can conclude that the AR(2) model has effectively captured the remaining autocorrelation.
# Create an ACF plot of the residuals from the AR(2) model
Acf(souvenir_Arima2$residuals)
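As an additional, illustrative check (not part of the assignment), a Ljung-Box test on the AR(2) residuals asks whether any autocorrelation remains across the first several lags; a large p-value is consistent with the visual read of the ACF plot.

#Ljung-Box test on the AR(2) residuals; fitdf = 2 adjusts the degrees of freedom for the two estimated AR coefficients
Box.test(souvenir_Arima2$residuals, lag = 15, type = "Ljung-Box", fitdf = 2)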
#Generate a forecast of the error terms using the AR(2) model of the residuals
errorForecast_souvenir <- forecast(souvenir_Arima2, h = 13)
errorForecast_souvenir
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2001 0.107882119 -0.07561892 0.2913832 -0.1727585 0.3885227
## Feb 2001 0.098551352 -0.09341395 0.2905167 -0.1950342 0.3921370
## Mar 2001 0.069245043 -0.14069093 0.2791810 -0.2518243 0.3903144
## Apr 2001 0.056801003 -0.15830920 0.2719112 -0.2721817 0.3857837
## May 2001 0.042171322 -0.17774923 0.2620919 -0.2941681 0.3785108
## Jun 2001 0.033088136 -0.18905521 0.2552315 -0.3066508 0.3728271
## Jul 2001 0.024902959 -0.19881529 0.2486212 -0.3172446 0.3670505
## Aug 2001 0.019038932 -0.20554500 0.2436229 -0.3244325 0.3625104
## Sep 2001 0.014219137 -0.21092154 0.2393598 -0.3301038 0.3585421
## Oct 2001 0.010576068 -0.21489090 0.2360430 -0.3342459 0.3553980
## Nov 2001 0.007679567 -0.21798999 0.2333491 -0.3374522 0.3528114
## Dec 2001 0.005446339 -0.22034478 0.2312375 -0.3398714 0.3507641
## Jan 2002 0.003692175 -0.22217346 0.2295578 -0.3417395 0.3491239
#Create the adjusted forecast by adding the two forecasts together
adjustedForecast <- model_forecast$mean + errorForecast_souvenir$mean
pander(adjustedForecast, split.table = Inf)
| | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2001 | 9780 | 13243 | 20442 | 15144 | 16225 | 16996 | 19894 | 19591 | 21865 | 24530 | 40145 | 86909 |
| 2002 | 12601 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
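One caveat worth flagging: the AR(2) model was fit to the regression residuals, which are on the log scale, while model_forecast$mean is on the original dollar scale, so the sum above mixes scales and barely changes the forecasts. A sketch of a scale-consistent alternative, assuming the residuals are indeed on the log scale, applies the error forecast multiplicatively:

#Hypothetical alternative (not the assignment's approach): back-transform the log-scale error forecast and apply it multiplicatively
adjustedForecast_mult <- model_forecast$mean * exp(errorForecast_souvenir$mean)
pander(adjustedForecast_mult, split.table = Inf)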
plot(souvenir_valid.ts, main="Adjusted Monthly Souvenir Sales Forecast \n by Shop X in Queensland, Australia", xlab="Year 2001", ylab = "Sales (Australian Dollars)", xaxt="n", yaxt="n", bty="l", lwd=2)
#Add the X-axis
axis(1, at = seq(2001,2001+11/12, 1/12), labels = c("Jan.", "Feb.", "Mar.", "Apr.", "May", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec."))
#Add the Y-axis
axis(2, at = seq(0, 125000, 25000), labels = format(seq(0, 125000, 25000)), las = 2)
#Add the adjusted forecast
lines(adjustedForecast, col="#64B5F6", lwd=2)
#Add the legend
legend(x = "topleft", legend = c("Actual sales", "Adjusted Forecast"), col = c("black", "#64B5F6"), lty = c(1, 1), bty = "n")
We can also look at the accuracy metrics.
pander(accuracy(adjustedForecast, souvenir_valid.ts), split.table = Inf)
| | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil's U |
|---|---|---|---|---|---|---|---|
| Test set | 4824 | 7101 | 5192 | 12.36 | 15.52 | 0.4245 | 0.461 |
#Upload of the ApplianceShipments.csv file
appliance <- read.csv("ApplianceShipments.csv")
appliance.ts <- ts(appliance[,2], start = c(1985,1), frequency = 4)
#Plotting Appliance shipments
plot(appliance.ts, main = "Number of Appliances Shipped by Quarter from 1985 to 1989", xlab = "Quarter", ylab = "Shipments ($ millions)", xaxt = "n", bty = "l", ylim = c(3000,5000), lwd = 3, type = "o", pch = 19)
#Add the X-axis
axis(1, at = seq(1985,1989+3/4, 1/4), labels = c("Q1-85", "Q2-85", "Q3-85", "Q4-85", "Q1-86", "Q2-86", "Q3-86", "Q4-86", "Q1-87", "Q2-87", "Q3-87", "Q4-87", "Q1-88", "Q2-88", "Q3-88", "Q4-88", "Q1-89", "Q2-89", "Q3-89", "Q4-89"))
appliance_acf <- Acf(appliance.ts, main = "ACF Plot of Appliances Shipped by Quarter", lwd=3)
appliance_acf
##
## Autocorrelations of series 'appliance.ts', by lag
##
## 0 1 2 3 4 5 6 7 8 9
## 1.000 0.261 -0.098 0.164 0.387 -0.030 -0.269 0.081 0.086 -0.168
## 10 11 12 13
## -0.325 -0.019 0.047 -0.096
From both the ACF plot and the autocorrelation values above, we can see that lag-4 has the highest coefficient at 0.387. This tells us that the same quarters from year to year are the most strongly correlated in the appliance shipments series, i.e., the series exhibits quarterly seasonality.
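As an illustrative follow-up (not required by the problem), differencing the series at lag 4 should remove this quarterly seasonality, which the ACF of the differenced series can confirm:

#ACF of the lag-4 (seasonally) differenced series; the lag-4 spike should largely disappear
Acf(diff(appliance.ts, lag = 4), main = "ACF Plot of Seasonally Differenced Shipments")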