#setwd("~/wd678/Unit 5a")
#install.packages("forecast")
library(forecast)
## Warning: package 'forecast' was built under R version 3.3.3
dataFrame <- read.csv("CanadianWorkHours.csv", stringsAsFactors=FALSE, header=TRUE)
workHours.ts <- ts(dataFrame$Hours.per.week, start=c(1966,1), end = c(2000,1), freq=1)
plot(workHours.ts, bty="l", main="Original series")
hoursLinear <- tslm(workHours.ts ~ trend)
summary(hoursLinear)
##
## Call:
## tslm(formula = workHours.ts ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20457 -0.28761 0.04779 0.30210 1.23190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.416134 0.175818 212.811 < 2e-16 ***
## trend -0.061373 0.008518 -7.205 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.5996
## F-statistic: 51.91 on 1 and 33 DF, p-value: 2.928e-08
hoursQuad <- tslm(workHours.ts ~ trend + I(trend^2))
summary(hoursQuad)
##
## Call:
## tslm(formula = workHours.ts ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94503 -0.20964 -0.01652 0.31862 0.60160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.1644156 0.2176599 175.340 < 2e-16 ***
## trend -0.1827154 0.0278786 -6.554 2.21e-07 ***
## I(trend^2) 0.0033706 0.0007512 4.487 8.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared: 0.7614, Adjusted R-squared: 0.7465
## F-statistic: 51.07 on 2 and 32 DF, p-value: 1.1e-10
yrange=range(workHours.ts)
plot(c(1966,2000), yrange, type="n", xlab="Year", "ylab"="Hours Per Week", bty="l", xaxt="n", yaxt="n")
axis(1, at=seq(1965,2000,5), labels=format(seq(1965,2000,5)))
axis(2, at=seq(34.5,38.0,0.5), labels=format(seq(34.5,38.0,0.5)), las=2)
lines(workHours.ts)
lines(hoursLinear$fitted, col="red")
lines(hoursQuad$fitted, col="yellow")
legend(1986,37.7, c("HrsPerWk", "Linear", "Quadratic"), lty=c(1,1,1), col=c("black", "red", "yellow"), bty="n")
dataF <- read.csv("DepartmentStoreSales.csv", stringsAsFactors=FALSE, header=TRUE)
storeSales.ts <- ts(dataF$Sales, start=c(1,1), freq=4)
plot(storeSales.ts, bty="l", main="Original series")
#lines(storeSales.ts$fitted, lwd = 1, col = "red")
nValid <- 4
nTrain <- length(storeSales.ts) - nValid
train.ts <- window(storeSales.ts, start = c(1,1), end = c(1, nTrain))
valid.ts <- window(storeSales.ts, start = c(1, nTrain + 1), end = c(1, nTrain + nValid))
salesExpo <- tslm(train.ts ~ trend + season, lambda=0)
summary(salesExpo)
##
## Call:
## tslm(formula = train.ts ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053524 -0.013199 -0.004527 0.014387 0.062681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.748945 0.018725 574.057 < 2e-16 ***
## trend 0.011088 0.001295 8.561 3.70e-07 ***
## season2 0.024956 0.020764 1.202 0.248
## season3 0.165343 0.020884 7.917 9.79e-07 ***
## season4 0.433746 0.021084 20.572 2.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.63e+11 on 4 and 15 DF, p-value: < 2.2e-16
validPeriodForecasts <- forecast(salesExpo, h=nValid)
validPeriodForecasts
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 6 Q1 58793.71 55790.19 61958.92 54090.84 63905.46
## 6 Q2 60951.51 57837.76 64232.89 56076.04 66250.87
## 6 Q3 70920.09 67297.09 74738.13 65247.24 77086.15
## 6 Q4 93788.66 88997.41 98837.86 86286.57 101943.01
yrange=range(storeSales.ts)
plot(c(1,7), yrange, type="n", xlab="Time", ylab="Store Sales(1000s)", bty="l", xaxt="n", yaxt="n", main="Regression model with exponential trend and seasonality")
axis(1,at=seq(1,7,1), labels=format(seq(1,7,1)))
axis(2,at=seq(50000,100000,10000), labels=format(seq(50,100,10)), las=2)
lines(storeSales.ts, bty="l")
lines(salesExpo$fitted, col="red")
lines(validPeriodForecasts$mean, col="blue", lty=2)
legend(1,100000, c("Store Sales", "Exponential Trend+Season", "Forecasts"), lty=c(1,1,2), col=c("black", "red", "blue"), bty="n")
summary(tslm(train.ts ~ trend + season, lambda=0))
##
## Call:
## tslm(formula = train.ts ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053524 -0.013199 -0.004527 0.014387 0.062681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.748945 0.018725 574.057 < 2e-16 ***
## trend 0.011088 0.001295 8.561 3.70e-07 ***
## season2 0.024956 0.020764 1.202 0.248
## season3 0.165343 0.020884 7.917 9.79e-07 ***
## season4 0.433746 0.021084 20.572 2.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.63e+11 on 4 and 15 DF, p-value: < 2.2e-16
yrange=range(train.ts)
plot(c(1,6), yrange, type="n", xlab="Time", ylab="Store Sales(1000s)", bty="l", xaxt="n", yaxt="n")
axis(1,at=seq(1,6,1), labels=format(seq(1,6,1)))
axis(2,at=seq(50000,100000,10000), labels=format(seq(50,100,10)), las=2)
lines(train.ts, bty="l")
lines(salesExpo$fitted, col="red")
plot(salesExpo$residuals, type="o", bty="l", main="Residuals for training period in log transformed units")
errorTrain <- train.ts - salesExpo$fitted
plot(errorTrain, type="o", bty="l", main="Residuals for training period in the original scale")
accuracy(salesExpo$fitted, train.ts)
## ME RMSE MAE MPE MAPE ACF1
## Test set 30.63141 1759.359 1388.794 -0.04013518 2.215018 0.5540818
## Theil's U
## Test set 0.1132336
errorValid <- valid.ts - validPeriodForecasts$mean
plot(errorValid, type="o", bty="l", main="Residuals in validation period")
accuracy(validPeriodForecasts$mean, valid.ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 5395.009 6076.912 5395.009 6.629046 6.629046 0.2156416 0.4259668
data.Frame <- read.csv("SouvenirSales.csv", stringsAsFactors=FALSE, header=TRUE)
souvSales.ts <- ts(data.Frame$Sales, start=c(1995,1), end = c(2001,12), freq=12)
plot(souvSales.ts, bty="l", main="Original series for Souvenir Sales")
nValid3 <- 12
nTrain3 <- length(souvSales.ts) - nValid3
train3.ts <- window(souvSales.ts, start = c(1995,1), end=c(1995, nTrain3))
valid3.ts <- window(souvSales.ts, start = c(1995, nTrain3 + 1), end = c(1995, nTrain3 + nValid3))
modelA.LinearSeason <- tslm(train3.ts ~ trend + season)
summary(modelA.LinearSeason)
##
## Call:
## tslm(formula = train3.ts ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12592 -2359 -411 1940 33651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3065.55 2640.26 -1.161 0.25029
## trend 245.36 34.08 7.199 1.24e-09 ***
## season2 1119.38 3422.06 0.327 0.74474
## season3 4408.84 3422.56 1.288 0.20272
## season4 1462.57 3423.41 0.427 0.67077
## season5 1446.19 3424.60 0.422 0.67434
## season6 1867.98 3426.13 0.545 0.58766
## season7 2988.56 3427.99 0.872 0.38684
## season8 3227.58 3430.19 0.941 0.35058
## season9 3955.56 3432.73 1.152 0.25384
## season10 4821.66 3435.61 1.403 0.16573
## season11 11524.64 3438.82 3.351 0.00141 **
## season12 32469.55 3442.36 9.432 2.19e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5927 on 59 degrees of freedom
## Multiple R-squared: 0.7903, Adjusted R-squared: 0.7476
## F-statistic: 18.53 on 12 and 59 DF, p-value: 9.435e-16
modelAForecast <- forecast(modelA.LinearSeason, h=nValid3)
modelAForecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2001 14846.03 6344.029 23348.03 1719.794 27972.27
## Feb 2001 16210.78 7708.778 24712.78 3084.542 29337.01
## Mar 2001 19745.60 11243.603 28247.60 6619.367 32871.84
## Apr 2001 17044.69 8542.689 25546.69 3918.454 30170.93
## May 2001 17273.68 8771.681 25775.68 4147.446 30399.92
## Jun 2001 17940.83 9438.828 26442.83 4814.592 31067.06
## Jul 2001 19306.78 10804.778 27808.78 6180.542 32433.01
## Aug 2001 19791.16 11289.159 28293.16 6664.924 32917.40
## Sep 2001 20764.50 12262.503 29266.50 7638.267 33890.74
## Oct 2001 21875.97 13373.964 30377.97 8749.729 35002.20
## Nov 2001 28824.31 20322.309 37326.31 15698.074 41950.55
## Dec 2001 50014.59 41512.586 58516.59 36888.351 63140.82
accuracy(modelAForecast$mean, valid3.ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 8251.513 17451.55 10055.28 10.53397 26.66568 0.3206228 0.9075924
yrange=range(souvSales.ts)
plot(c(1995,2002), yrange, type="n", xlab="Time", ylab="Souvenir Sales (thousand $)", bty="l", xaxt="n", yaxt="n", main="Model A - Sales modeled by linear trend & monthly predictors")
axis(1,at=seq(1995,2002,1), labels=format(seq(1995,2002,1)))
axis(2,at=seq(1000,105000,20000), labels=format(seq(1,105,20)), las=2)
lines(souvSales.ts, lty=2)
lines(modelA.LinearSeason$fitted, col="red")
lines(modelAForecast$mean, col="blue")
legend(1995,101000, c("Original Series", "Fitted in training period", "Forecasted in validation period"), lty=c(2,1,1), col=c("black", "red", "blue"), bty="n")
modelB <- tslm(log(train3.ts) ~ trend + season)
summary(modelB)
##
## Call:
## tslm(formula = log(train3.ts) ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4529 -0.1163 0.0001 0.1005 0.3438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.646363 0.084120 90.898 < 2e-16 ***
## trend 0.021120 0.001086 19.449 < 2e-16 ***
## season2 0.282015 0.109028 2.587 0.012178 *
## season3 0.694998 0.109044 6.374 3.08e-08 ***
## season4 0.373873 0.109071 3.428 0.001115 **
## season5 0.421710 0.109109 3.865 0.000279 ***
## season6 0.447046 0.109158 4.095 0.000130 ***
## season7 0.583380 0.109217 5.341 1.55e-06 ***
## season8 0.546897 0.109287 5.004 5.37e-06 ***
## season9 0.635565 0.109368 5.811 2.65e-07 ***
## season10 0.729490 0.109460 6.664 9.98e-09 ***
## season11 1.200954 0.109562 10.961 7.38e-16 ***
## season12 1.952202 0.109675 17.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 59 degrees of freedom
## Multiple R-squared: 0.9424, Adjusted R-squared: 0.9306
## F-statistic: 80.4 on 12 and 59 DF, p-value: < 2.2e-16
modelBForecast <- forecast(modelB, h=nValid3)
modelBForecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2001 9.188097 8.917220 9.458974 8.769890 9.606304
## Feb 2001 9.491232 9.220354 9.762109 9.073024 9.909439
## Mar 2001 9.925335 9.654457 10.196212 9.507127 10.343542
## Apr 2001 9.625329 9.354452 9.896207 9.207122 10.043537
## May 2001 9.694286 9.423408 9.965163 9.276078 10.112493
## Jun 2001 9.740741 9.469864 10.011619 9.322534 10.158949
## Jul 2001 9.898195 9.627318 10.169072 9.479988 10.316402
## Aug 2001 9.882831 9.611954 10.153708 9.464624 10.301038
## Sep 2001 9.992619 9.721742 10.263496 9.574412 10.410826
## Oct 2001 10.107664 9.836787 10.378542 9.689457 10.525872
## Nov 2001 10.600248 10.329370 10.871125 10.182040 11.018455
## Dec 2001 11.372615 11.101738 11.643493 10.954408 11.790823
accuracy(exp(modelBForecast$mean), valid3.ts)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 4824.494 7101.444 5191.669 12.35943 15.5191 0.4245018 0.4610253
yrange2=range(log(souvSales.ts))
plot(c(1995,2002), yrange2, type="n", xlab="Time", ylab="log(Sales [$])", bty="l", xaxt="n", yaxt="n", main="Model B - log(Sales) modeled by linear trend & monthly predictors")
axis(1,at=seq(1995,2002,1), labels=format(seq(1995,2002,1)))
axis(2,at=seq(6.8,11.6,1.0), labels=format(seq(6.8,11.6,1.0)), las=2)
lines(log(souvSales.ts), lty=2)
lines(modelB$fitted, col="red")
lines(modelBForecast$mean, col="blue")
legend(1994.75,11.8, c("Original Series", "Fitted in training period", "Forecasted in validation period"), lty=c(2,1,1), col=c("black", "red", "blue"), bty="n")
#yrange4=range(souvSales.ts)
plot(c(2001,2002), c(1000,105000), type="n", xlab="Year 2001", ylab="Sales (thousand $)", bty="l", xaxt="n", yaxt="n", main="Actuals and two models for validation period")
axis(1,at=seq(2001,2001+11/12,1/12), labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
axis(2,at=seq(1000,105000,20000), labels=format(seq(1,105,20)), las=2)
lines(valid3.ts, lty=2)
lines(modelAForecast$mean, col="blue")
lines(exp(modelBForecast$mean), col="red")
legend(2001,101000, c("Actuals", "Model B: log(y) Linear+Season", "Model A: y Linear+Season"), lty=c(2,1,1), col=c("black", "red", "blue"), bty="n")
febForecast <- modelB$coefficients["(Intercept)"] + modelB$coefficients["trend"]*86 + modelB$coefficients["season2"]
exp(febForecast)
## (Intercept)
## 17062.99
If the goal was understanding different components of souvenir sales, the data could be modeled by the number of souvenir items sold according to a classification scheme, instead of the total dollar value of the souvenirs sold. Classification schemes could be any of the following: 1) by gender of person purchasing item, 2) by age of purchaser, 3) by the department the item was sold from, 4) by the type of marketing surrounding the item, 5) by profit made from item; etc.
The types and quantities of items sold during each month could be used to inform inventory, staffing numbers, and marketing. External information, relevant to characteristics of weather and customers(tourists) could also be incorporated into a consumer demand related strategy.