library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mosaic)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(ggformula)
library(readr)
library(fpp2)
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
## Loading required package: fma
## Loading required package: expsmooth
###CHAPTER 5
##1. Daily Electricity Demand in Victoria, Australia, 2014
daily20 <- head(elecdaily, 20)
daily20
## Time Series:
## Start = c(1, 1)
## End = c(3, 6)
## Frequency = 7
## Demand WorkDay Temperature
## 1.000000 174.8963 0 26.0
## 1.142857 188.5909 1 23.0
## 1.285714 188.9169 1 22.2
## 1.428571 173.8142 0 20.3
## 1.571429 169.5152 0 26.1
## 1.714286 195.7288 1 19.6
## 1.857143 199.9029 1 20.0
## 2.000000 205.3375 1 27.4
## 2.142857 228.0782 1 32.4
## 2.285714 258.5984 1 34.0
## 2.428571 201.7970 0 22.4
## 2.571429 187.6298 0 22.5
## 2.714286 254.6636 1 30.0
## 2.857143 322.2323 1 42.4
## 3.000000 343.9934 1 41.5
## 3.142857 347.6376 1 43.2
## 3.285714 332.9455 1 43.1
## 3.428571 219.7517 0 23.7
## 3.571429 186.9816 0 22.3
## 3.714286 228.4876 1 24.0
autoplot(daily20)

#A: There is a positive relationship because at higher temps, higher demand, and at lower temps, lower demand.
daily20 %>% as.data.frame() %>% gf_point(Demand ~ Temperature, data = daily20)

tempmodel <- tslm(Demand ~ Temperature, data=daily20)
summary(tempmodel)
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.060 -7.117 -1.437 17.484 27.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.2117 17.9915 2.179 0.0428 *
## Temperature 6.7572 0.6114 11.052 1.88e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22 on 18 degrees of freedom
## Multiple R-squared: 0.8716, Adjusted R-squared: 0.8644
## F-statistic: 122.1 on 1 and 18 DF, p-value: 1.876e-09
#B: The high p-value in the Breusch-Godfrey test indicates there is no concern for autocorelation; the ACF is not worrisome. The residual plot shows a steady linear growth pattern, indicating that the regression model is not apporpriate.
checkresiduals(tempmodel)

##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
#C: The model prediction for 35 degrees is reasonable, but for 15 degrees it seems inapprporiate, follwing the trend line.
foretemp <- forecast(tempmodel, newdata = data.frame(Temperature = c(15, 35)))
foretemp
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 3.857143 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.000000 275.7146 245.2278 306.2014 227.57056 323.8586
#D: The prediction intervals are in the output below, as well as shown in the plot bands.
autoplot(foretemp, facets = TRUE)

foretemp
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 3.857143 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.000000 275.7146 245.2278 306.2014 227.57056 323.8586
#E: When we plot the entire data set, it is clear that this is NOT linear!
elecdaily %>% as.data.frame() %>% gf_point(Demand ~ Temperature, data = daily20) + geom_smooth(method='lm', se=FALSE)

##2: Mens 400 Meter Olympic winning times 896-2016.
#A:Winning times are generally trending lower, meaning faster, over time.
autoplot(mens400)

qplot(time(mens400), mens400)
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Warning: Removed 3 rows containing missing values (geom_point).

#B: Average decrease per year is 0.06457 seconds.
time400 <- time(mens400)
(menmodel <- tslm(mens400 ~ time400))
##
## Call:
## tslm(formula = mens400 ~ time400)
##
## Coefficients:
## (Intercept) time400
## 172.48148 -0.06457
#C: There is an outlier in the first data point (year 1896) and a noticeable curved trend around the 0 residual level; linear modelling is not a good fit for this data set.
qplot(time(mens400), residuals(menmodel))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Warning: Removed 3 rows containing missing values (geom_point).

#D: In 2020, we expect a winning time of 42,04231 seconds; see output for 80 and 95 PIs. This assumes the trend holds linearly, which is unlikely! Current world record is 43.03 seconds!
forecast(menmodel, newdata = data.frame(time400=2020))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020 42.04231 40.44975 43.63487 39.55286 44.53176
##3: Type Easter(ausbeer), what do we see?
#This shows us which quarter has Easter in it; where partial, Easter is allocated across quarters since in countries that recognize Easter as a legal holiday, it is a few days long.
easter(ausbeer)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00
##4: Elasticity Coefficient
#Elasticity coefficient shows the percentage change that will occur in Y when X changes 1 percent. It is the ratio (% change Y) / (% change X). When you turn a linear model into a log-log model; meaning logY = INtercept + BetalogX, Beta will become the elasticity coefficient.
##5: Fancy data set, monthly sales of a shop starting January 1987. Spikes at Christmas and Surf Festival (March) expected.
#A: sales are increasing over time. There are noticeable spikes in December and March, with December being about 4 times taller than March.
autoplot(fancy, xlab="Year", ylab="Sales")

#B: TO stabilize the variance we would take logs of the data. In a plot, we see this improvement.
autoplot(log(fancy), xlab="Year", ylab="Log Sales")

#C: Formulate regression with seasonal variable and a dummy variable for surf festival.
surf <- cycle(fancy) == 3
surf[3] <- FALSE
salesmodel <- tslm(fancy ~ trend + season + surf, lambda = 0)
summary(salesmodel)
##
## Call:
## tslm(formula = fancy ~ trend + season + surf, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33673 -0.12757 0.00257 0.10911 0.37671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend 0.0220198 0.0008268 26.634 < 2e-16 ***
## season2 0.2514168 0.0956790 2.628 0.010555 *
## season3 0.2660828 0.1934044 1.376 0.173275
## season4 0.3840535 0.0957075 4.013 0.000148 ***
## season5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season12 1.9622412 0.0961066 20.417 < 2e-16 ***
## surfTRUE 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 4.105e+10 on 13 and 70 DF, p-value: < 2.2e-16
autoplot(fancy, ylab = "Sales", xlab = "Year") + autolayer(fitted(salesmodel), color = "maroon", lwd = 1, series = "Fitted")

#D: The residual plot over time shows evidence of non-linearity, as it appears cyclical; the residuals to fitted is OK.
autoplot(residuals(salesmodel))

qplot(fitted(salesmodel), residuals(salesmodel))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

#E: Boxplots panelled by month shows unequal variance across the months.
month <- factor(cycle(residuals(salesmodel)), labels = month.abb)
ggplot() + geom_boxplot((aes(x=month, y = residuals(salesmodel), group=month)))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

gf_boxplot(residuals(salesmodel) ~ month)
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

#F: The intercept is outside the range so uninterpretable; the trend coefficent shows that with each year, the log of sales increases by .02.
#December sales increase over November by log 1.96, on average, the highest value. The surf "season" increases sales by log .50, on average.
coefficients(salesmodel)
## (Intercept) trend season2 season3 season4 season5
## 7.61966701 0.02201983 0.25141682 0.26608280 0.38405351 0.40948697
## season6 season7 season8 season9 season10 season11
## 0.44882828 0.61045453 0.58796443 0.66932985 0.74739195 1.20674790
## season12 surfTRUE
## 1.96224123 0.50151509
#G: The B-G test is significant (p = 0.0025), indicating there is a likelyihood of serial corelation.
checkresiduals(salesmodel)

##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494
#H: See graphical & summary output below.
future.surf <- rep(FALSE, 36)
future.surf[c(3, 15, 27)] <- TRUE
foresale <- forecast(salesmodel, h=36, newdata = data.frame(surf=future.surf))
foresale
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1994 13244.70 10285.82 17054.73 8969.583 19557.43
## Feb 1994 17409.81 13520.45 22418.00 11790.284 25707.73
## Mar 1994 29821.65 23129.40 38450.24 20155.412 44123.68
## Apr 1994 20774.16 16133.21 26750.16 14068.696 30675.62
## May 1994 21783.73 16917.24 28050.15 14752.395 32166.37
## Jun 1994 23162.27 17987.81 29825.24 15685.969 34201.95
## Jul 1994 27831.56 21613.98 35837.72 18848.111 41096.73
## Aug 1994 27818.48 21603.82 35820.87 18839.249 41077.41
## Sep 1994 30848.42 23956.87 39722.43 20891.193 45551.50
## Oct 1994 34095.57 26478.61 43903.67 23090.230 50346.32
## Nov 1994 55176.84 42850.31 71049.28 37366.903 81475.41
## Dec 1994 120067.79 93244.59 154607.08 81312.400 177294.90
## Jan 1995 17250.40 13357.65 22277.59 11629.938 25587.08
## Feb 1995 22675.20 17558.28 29283.31 15287.252 33633.55
## Mar 1995 38840.85 30046.98 50208.44 26146.972 57697.39
## Apr 1995 27057.06 20951.33 34942.16 18241.435 40133.06
## May 1995 28371.96 21969.51 36640.25 19127.918 42083.42
## Jun 1995 30167.42 23359.80 38958.95 20338.387 44746.58
## Jul 1995 36248.88 28068.91 46812.70 24438.412 53767.06
## Aug 1995 36231.84 28055.72 46790.69 24426.922 53741.78
## Sep 1995 40178.16 31111.50 51887.06 27087.467 59595.26
## Oct 1995 44407.37 34386.35 57348.77 29938.733 65868.34
## Nov 1995 71864.42 55647.40 92807.48 48449.831 106594.69
## Dec 1995 156380.86 121091.75 201954.08 105429.448 231955.81
## Jan 1996 22467.57 17336.40 29117.46 15065.329 33506.86
## Feb 1996 29533.04 22788.25 38274.14 19802.984 44043.89
## Mar 1996 50587.81 39009.73 65602.25 33887.802 75517.62
## Apr 1996 35240.15 27191.96 45670.42 23629.808 52555.15
## May 1996 36952.72 28513.41 47889.88 24778.151 55109.18
## Jun 1996 39291.20 30317.82 50920.48 26346.183 58596.65
## Jul 1996 47211.93 36429.60 61185.57 31657.322 70409.18
## Aug 1996 47189.73 36412.48 61156.80 31642.439 70376.07
## Sep 1996 52329.57 40378.47 67817.91 35088.887 78041.33
## Oct 1996 57837.85 44628.77 74956.52 38782.394 86256.08
## Nov 1996 93598.96 72222.70 121302.09 62761.521 139588.15
## Dec 1996 203676.38 157160.50 263959.89 136572.460 303751.35
autoplot(foresale)

#I: transform completed as we used log in our time series model formulation.
#J: the model would be improved by addresseing the non-linearity issues in the trend line.
##6: Gasoline data set of weekly supply iof US gasoline, 2/2/1991 - 1/20/1997.
head(gasoline)
## Time Series:
## Start = 1991.1
## End = 1991.19582477755
## Frequency = 52.1785714285714
## [1] 6.621 6.433 6.582 7.224 6.875 6.947
gas04 = window(gasoline, end=2004)
#A: the harmonic regression shows more density towards a linear model at higher levels of the fourier term. fourier inputs are x=gas, K=# fourier terms, h=forecast horizon desired.
gasmod2 <- tslm(gas04~trend + fourier(gas04, K=2))
gasmod4 <- tslm(gas04~trend + fourier(gas04, K=4))
gasmod8 <- tslm(gas04~trend + fourier(gas04, K=8))
gasmod12 <- tslm(gas04~trend + fourier(gas04, K=12))
plot.ts(gas04, gasmod2$fitted.values)

plot.ts(gas04, gasmod4$fitted.values)

plot.ts(gas04, gasmod8$fitted.values)

plot.ts(gas04, gasmod12$fitted.values)

#B: Based on lowest CV or AIC, the best # of fourier terms is 8. Based on adjusted R-square, it is also 8.
CV(gasmod2)
## CV AIC AICc BIC AdjR2
## 8.430123e-02 -1.664847e+03 -1.664679e+03 -1.633254e+03 8.010614e-01
CV(gasmod4)
## CV AIC AICc BIC AdjR2
## 7.983256e-02 -1.701660e+03 -1.701261e+03 -1.652014e+03 8.127370e-01
CV(gasmod8)
## CV AIC AICc BIC AdjR2
## 0.0747915 -1745.9534846 -1744.7914051 -1660.2021125 0.8266732
CV(gasmod12)
## CV AIC AICc BIC AdjR2
## 7.412822e-02 -1.752501e+03 -1.750161e+03 -1.630644e+03 8.303062e-01
#C: CHeck residuals - normality is good, but residual plot is not! And B-G test is significant, indicating serial correlation.
checkresiduals(gasmod8)

##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 147.46, df = 104, p-value = 0.003284
#D: Forecast & plot future values
fc <- forecast(gasmod8, newdata = data.frame(fourier(gasoline, 8, 52)))
autoplot(fc)

#E: The forecast mathces the data reasonably well.
#Plot along with actual 2005 data.
autoplot(fc) + autolayer(window(gasoline, start=2005, end=2005.99), series = "Actual 2005 Data")

##7: Huron data set of water levels lake Huron, 1875 - 1972.
head(huron)
## Time Series:
## Start = 1875
## End = 1880
## Frequency = 1
## [1] 10.38 11.86 10.97 10.80 9.79 10.39
#A: The water level has a generally downward trend right thru the early 20th century, then rises, and takes a stable level.
autoplot(huron)

#B: The slopes for before/after 1915 are almost exactly opposite, indicating a flat trend post-1915.
huronmodel <- tslm(huron ~ trend)
summary(huronmodel)
##
## Call:
## tslm(formula = huron ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50997 -0.72726 0.00083 0.74402 2.53565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.202037 0.230111 44.335 < 2e-16 ***
## trend -0.024201 0.004036 -5.996 3.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 96 degrees of freedom
## Multiple R-squared: 0.2725, Adjusted R-squared: 0.2649
## F-statistic: 35.95 on 1 and 96 DF, p-value: 3.545e-08
t <- time(huron)
t.break <- 1915
tb1 <- ts(pmax(0, t - t.break), start = 1875)
huronpw <- tslm(huron ~ t + tb1)
summary(huronpw)
##
## Call:
## tslm(formula = huron ~ t + tb1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49626 -0.66240 -0.07139 0.85163 2.39222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 132.90870 19.97687 6.653 1.82e-09 ***
## t -0.06498 0.01051 -6.181 1.58e-08 ***
## tb1 0.06486 0.01563 4.150 7.26e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.045 on 95 degrees of freedom
## Multiple R-squared: 0.3841, Adjusted R-squared: 0.3711
## F-statistic: 29.62 on 2 and 95 DF, p-value: 1.004e-10
#C: When we forecast 8 years, up to 1980, we see that the green Piecewise fits quite effectively and the linear model does not.
h <- 8
forelin <- forecast(huronmodel, h=h)
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] +seq(h)
newdata <- cbind(t=t.new, tb1 = tb1.new) %>% as.data.frame()
forepw <- forecast(huronpw, newdata = newdata)
autoplot(huron) +
autolayer(fitted(huronmodel), series = "Linear Model") +
autolayer(fitted(huronpw), series = "Piecewise") +
autolayer(forelin, series = "Linear Model") +
autolayer(forepw, series = "Piecewise Model")

###CHAPTER 6
library(seasonal)
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:mosaic':
##
## inspect
## The following object is masked from 'package:tibble':
##
## view
#1: Show that a 3 X 5 MA is equivalent to a 7-term wghted moving average
#If we had terms Y1 thru Y7, the 3 X 5 MA would be equal to:
#First 5 is sum(Y1, Y2, Y3, Y4, Y5)
#Second 5 is sum(Y2, Y3, Y4, Y5, Y6)
#Third5 is sum(Y3, Y4, Y5, Y6, Y7)
#So TOTAL is 1/15*Y1 + 2/15*Y2 + 3/15*Y3 + 3/15*Y4 + 3/15*Y5 +2/15*Y6 + 1/15*Y7
#We know that 1/15 = 0.067, 2/15 = /0.133, 3/15=0.20.
#These are the corresponding weights to the provided 7-term weighted avg
#2: Plastics data set, monthly sales, in thousands, over 5 years, for a single product.
#A: autoplot of the times series seems to show a positve trend with seasonal peaks.
autoplot(plastics)

#B: Use multiplicative decomposition
plasmuldeco <- decompose(plastics, type=c("multiplicative"))
plot(plasmuldeco)

#C: yes, there is a clear linear positive trend shown (with tail deviations), as well as a rolling seasonal pattern.
#D: The linear trend appears, but the tail non-linearity is quite obvious.
plasmuldeco %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted")

autoplot(plastics, series="Data", xlab = "YEAR", ylab = "Sales") +
autolayer(trendcycle(plasmuldeco), series="Trend") +
autolayer(seasadj(plasmuldeco), series="Seasonally Adjusted")
## Warning: Removed 12 rows containing missing values (geom_path).

#E: When I change one item by 500 (point 35) the outlier appears, but not at the full 500 strength; it is buried a bit in the seasonal peak.
plasticsA <-plastics
plasticsA[35] <- plasticsA[35] + 500
plasmuldecoA <- decompose(plasticsA, type=c("multiplicative"))
plot(plasmuldecoA)

plasmuldecoA %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted with PLus 500")

#F: When I change one item by 500 (point 59) the outlier appears, and at almost full height of 500.
plasticsB <-plastics
plasticsB[59] <- plasticsA[59] + 500
plasmuldecoB <- decompose(plasticsB, type=c("multiplicative"))
plot(plasmuldecoB)

plasmuldecoB %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted with TAIL END PLus 500")

#3: Retail Sales Data
#read in Retail Data, pick an item
library(readxl)
retail <- read_excel("E:/WOODS/ADECXXXX/retail.xlsx", skip = 1)
head(retail)
## # A tibble: 6 x 190
## `Series ID` A3349335T A3349627V A3349338X A3349398A A3349468W
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1982-04-01 00:00:00 303. 41.7 63.9 409. 65.8
## 2 1982-05-01 00:00:00 298. 43.1 64 405. 65.8
## 3 1982-06-01 00:00:00 298 40.3 62.7 401 62.3
## 4 1982-07-01 00:00:00 308. 40.9 65.6 414. 68.2
## 5 1982-08-01 00:00:00 299. 42.1 62.6 404. 66
## 6 1982-09-01 00:00:00 305. 42 64.4 412. 62.3
## # ... with 184 more variables: A3349336V <dbl>, A3349337W <dbl>,
## # A3349397X <dbl>, A3349399C <dbl>, A3349874C <dbl>, A3349871W <dbl>,
## # A3349790V <dbl>, A3349556W <dbl>, A3349791W <dbl>, A3349401C <dbl>,
## # A3349873A <dbl>, A3349872X <dbl>, A3349709X <dbl>, A3349792X <dbl>,
## # A3349789K <dbl>, A3349555V <dbl>, A3349565X <dbl>, A3349414R <dbl>,
## # A3349799R <dbl>, A3349642T <dbl>, A3349413L <dbl>, A3349564W <dbl>,
## # A3349416V <dbl>, A3349643V <dbl>, A3349483V <dbl>, A3349722T <dbl>,
## # A3349727C <dbl>, A3349641R <dbl>, A3349639C <dbl>, A3349415T <dbl>,
## # A3349349F <dbl>, A3349563V <dbl>, A3349350R <dbl>, A3349640L <dbl>,
## # A3349566A <dbl>, A3349417W <dbl>, A3349352V <dbl>, A3349882C <dbl>,
## # A3349561R <dbl>, A3349883F <dbl>, A3349721R <dbl>, A3349478A <dbl>,
## # A3349637X <dbl>, A3349479C <dbl>, A3349797K <dbl>, A3349477X <dbl>,
## # A3349719C <dbl>, A3349884J <dbl>, A3349562T <dbl>, A3349348C <dbl>,
## # A3349480L <dbl>, A3349476W <dbl>, A3349881A <dbl>, A3349410F <dbl>,
## # A3349481R <dbl>, A3349718A <dbl>, A3349411J <dbl>, A3349638A <dbl>,
## # A3349654A <dbl>, A3349499L <dbl>, A3349902A <dbl>, A3349432V <dbl>,
## # A3349656F <dbl>, A3349361W <dbl>, A3349501L <dbl>, A3349503T <dbl>,
## # A3349360V <dbl>, A3349903C <dbl>, A3349905J <dbl>, A3349658K <dbl>,
## # A3349575C <dbl>, A3349428C <dbl>, A3349500K <dbl>, A3349577J <dbl>,
## # A3349433W <dbl>, A3349576F <dbl>, A3349574A <dbl>, A3349816F <dbl>,
## # A3349815C <dbl>, A3349744F <dbl>, A3349823C <dbl>, A3349508C <dbl>,
## # A3349742A <dbl>, A3349661X <dbl>, A3349660W <dbl>, A3349909T <dbl>,
## # A3349824F <dbl>, A3349507A <dbl>, A3349580W <dbl>, A3349825J <dbl>,
## # A3349434X <dbl>, A3349822A <dbl>, A3349821X <dbl>, A3349581X <dbl>,
## # A3349908R <dbl>, A3349743C <dbl>, A3349910A <dbl>, A3349435A <dbl>,
## # A3349365F <dbl>, A3349746K <dbl>, ...
retailts <- ts(retail[, "A3349627V"], frequency = 12, start = c(1982, 4))
#Decomposing using the X11 process, note two spikes in late 1980s and at 1990; relatively flat trend until 2000, then positive. Seasonal pattern is clear.
retaildeco <- retailts %>% seas(x11 = "")
autoplot(retaildeco, main = "X11 Decomposition of Retail Item A3349627V")

#4: Discuss decompostition in figure 6.16, page 181 of printed text.
#A: We can see in this decomposition that there is a strong positive linear trend with a flat area in 1990 range;
#there is a strong and complex seasonal pattern that increases in variabilty then stabilizes into a broad pattern;
#high levels of employment are seen in December year after year;
#the trend range is quite wide in comparison to the seasonal pattern, which is quite narrow relative to the trend increase.
#B: The recession of 1991/1992 shows in the flattening of the trend line, as well as the deep negative dips in the remainder panel.
#5: Cangas data set, monthly Canadian gas production, billions cubic meters, Jan-1960 thru Feb-2005.
#A: Overall, increase gas production over time. Flattens in late 70s - mid 80s (energy crisis I recall).
#Subseries plot also shows that flattening in each month during those years.
#The seasonal plot allows us to see that production was relatively flat in the beginning (1960s) but has become more volatile over time, within each year.
autoplot(cangas, ylab = "Billion Cubic Meters")

ggsubseriesplot(cangas, ylab = "Billion Cubic Meters")

ggseasonplot(cangas, ylab = "Billion Cubic Meters")

#B: STL seasonal component is far more stable than the other two decompositon methods; trend panel shows more volatility in SEATS and X11.
cangas %>% stl(t.window = 13, s.window="periodic", robust = TRUE) %>% autoplot(main = "STL DECOMP CANGAS")

cangas %>% seas() %>% autoplot(main = "SEATS DECOMP CANGAS")

cangas %>% seas(x11 = "") %>% autoplot(main = "X11 Decomposition of CANGAS")

#6: bricksq data, quarterly clay brick production, 1956 - 1994.
head(bricksq)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 189 204 208 197
## 1957 187 214
#A: Higher t values smooth the trendline; using the changing values stabilizes the seasonal width.
stlfix <- stl(bricksq, s.window = "periodic", robust = TRUE)
autoplot(stlfix, main = "STL FIXED for BRICK")

stlchange <- stl(bricksq, s.window = 13)
autoplot(stlchange, main = "STL CHANGE for BRICK")

#B: Compute and plot the seasonally adjusted forecast
changebrickfc <-forecast(stlchange)
changebrickfc
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1994 Q4 465.7326 437.4903 493.9748 422.5398 508.9254
## 1995 Q1 424.6727 384.7119 464.6335 363.5580 485.7874
## 1995 Q2 483.9914 435.0232 532.9595 409.1010 558.8817
## 1995 Q3 493.9988 437.4242 550.5734 407.4754 580.5222
## 1995 Q4 465.7326 402.4454 529.0198 368.9431 562.5220
## 1996 Q1 424.6727 355.3067 494.0387 318.5865 530.7589
## 1996 Q2 483.9914 409.0259 558.9568 369.3416 598.6411
## 1996 Q3 493.9988 413.8128 574.1848 371.3650 616.6326
autoplot(changebrickfc)

#C: Naive forecast seasonally adjusted data
stlfix %>% seasadj() %>% naive() %>% autoplot(main = "Naive Forecast Seasonally Adj Data")

#D: Re-seasonalize the data
brickfs <- stlf(bricksq)
autoplot(brickfs)

#E: The Ljung-Box is significant, so there is still autocorrelation. But the graphs show normal distributoin of residuals, the ACF graph is good (with one high negaitve lag), and the residuals do not show strong eviations from the assumptions.
checkresiduals(brickfs)
## Warning in checkresiduals(brickfs): The fitted degrees of freedom is based
## on the model used for the seasonally adjusted data.

##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 41.128, df = 6, p-value = 2.733e-07
##
## Model df: 2. Total lags used: 8
#F: Revise to a robust forecast;very little change in residual diagnostics.
brickfsrobust <- stlf(bricksq, robust = TRUE)
autoplot(brickfsrobust)

checkresiduals(brickfsrobust)
## Warning in checkresiduals(brickfsrobust): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 28.163, df = 6, p-value = 8.755e-05
##
## Model df: 2. Total lags used: 8
#G: The stlf() forcast is better, less variability is seen in the plot prediction intervals.
bricksA <- window(bricksq, end = c(1987, 4))
forecast1 <- stlf(bricksA, robust = TRUE)
forecast2 <- snaive(bricksA)
autoplot(forecast1)

autoplot(forecast2)

#7: Using the writing data, choose forecast.
head(writing)
## Jan Feb Mar Apr May Jun
## 1968 562.674 599.000 668.516 597.798 579.889 668.233
autoplot(writing)

# There is an upward trend in the data, with some seasonal drops.There is little difference seen between naive adn drift models.
writenaive <- stlf(writing, method = 'naive')
autoplot(writenaive)

writedrift = stlf(writing, method = 'rwdrift')
autoplot(writedrift)

#8: Using the fancy data, choose forecast.
head(fancy)
## Jan Feb Mar Apr May Jun
## 1987 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74
autoplot(fancy)

# There is an upward trend in the data, with some seasonal drops.
#There is little difference seen between naive and drift models.
#There is a lot of variation change, so I will use a Box-Cox transform.
#The Box-cox lambda is quite close to 0 (.00213); I will use .1 as a moderate estimate.
fancynaive <- stlf(fancy, method = 'naive')
autoplot(fancynaive)

fancydrift = stlf(fancy, method = 'rwdrift')
autoplot(fancydrift)

BoxCox.lambda(fancy)
## [1] 0.002127317
fancylambda <- .1
fancyfinal<- stlf(fancy, method='rwdrift', lambda = fancylambda)
autoplot(fancyfinal, main = "STL Forecast, RW Drift with lambda= 0.1")
