HW1-Notes.utf8.md

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(mosaic)

## Loading required package: lattice

## Loading required package: ggformula

## Loading required package: ggstance

## 
## Attaching package: 'ggstance'

## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh

## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")

## Loading required package: mosaicData

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following object is masked from 'package:purrr':
## 
##     cross

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

library(ggformula)
library(readr)
library(fpp2)

## Loading required package: forecast

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff

## Loading required package: fma

## Loading required package: expsmooth

###CHAPTER 5
##1. Daily Electricity Demand in Victoria, Australia, 2014
daily20 <- head(elecdaily, 20)
daily20

## Time Series:
## Start = c(1, 1) 
## End = c(3, 6) 
## Frequency = 7 
##            Demand WorkDay Temperature
## 1.000000 174.8963       0        26.0
## 1.142857 188.5909       1        23.0
## 1.285714 188.9169       1        22.2
## 1.428571 173.8142       0        20.3
## 1.571429 169.5152       0        26.1
## 1.714286 195.7288       1        19.6
## 1.857143 199.9029       1        20.0
## 2.000000 205.3375       1        27.4
## 2.142857 228.0782       1        32.4
## 2.285714 258.5984       1        34.0
## 2.428571 201.7970       0        22.4
## 2.571429 187.6298       0        22.5
## 2.714286 254.6636       1        30.0
## 2.857143 322.2323       1        42.4
## 3.000000 343.9934       1        41.5
## 3.142857 347.6376       1        43.2
## 3.285714 332.9455       1        43.1
## 3.428571 219.7517       0        23.7
## 3.571429 186.9816       0        22.3
## 3.714286 228.4876       1        24.0

autoplot(daily20)

#A: There is a positive relationship because at higher temps, higher demand, and at lower temps, lower demand.
daily20 %>% as.data.frame() %>% gf_point(Demand ~ Temperature, data = daily20)

tempmodel <- tslm(Demand ~ Temperature, data=daily20)
summary(tempmodel)

## 
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.060  -7.117  -1.437  17.484  27.102 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.2117    17.9915   2.179   0.0428 *  
## Temperature   6.7572     0.6114  11.052 1.88e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22 on 18 degrees of freedom
## Multiple R-squared:  0.8716, Adjusted R-squared:  0.8644 
## F-statistic: 122.1 on 1 and 18 DF,  p-value: 1.876e-09

#B: The high p-value in the Breusch-Godfrey test indicates there is no concern for autocorelation; the ACF is not worrisome.  The residual plot shows a steady linear growth pattern, indicating that the regression model is not apporpriate.
checkresiduals(tempmodel)

## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774

#C: The model prediction for 35 degrees is reasonable, but for 15 degrees it seems inapprporiate, follwing the trend line. 
foretemp <- forecast(tempmodel, newdata = data.frame(Temperature = c(15, 35)))
foretemp

##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 3.857143       140.5701 108.6810 172.4591  90.21166 190.9285
## 4.000000       275.7146 245.2278 306.2014 227.57056 323.8586

#D: The prediction intervals are in the output below, as well as shown in the plot bands.  
autoplot(foretemp, facets = TRUE)

foretemp

##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 3.857143       140.5701 108.6810 172.4591  90.21166 190.9285
## 4.000000       275.7146 245.2278 306.2014 227.57056 323.8586

#E: When we plot the entire data set, it is clear that this is NOT linear!
elecdaily %>% as.data.frame() %>% gf_point(Demand ~ Temperature, data = daily20) + geom_smooth(method='lm', se=FALSE)

##2: Mens 400 Meter Olympic winning times 896-2016.
#A:Winning times are generally trending lower, meaning faster, over time.
autoplot(mens400)

qplot(time(mens400), mens400)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

## Warning: Removed 3 rows containing missing values (geom_point).

#B: Average decrease per year is 0.06457 seconds.
time400 <- time(mens400)
(menmodel <- tslm(mens400 ~ time400))

## 
## Call:
## tslm(formula = mens400 ~ time400)
## 
## Coefficients:
## (Intercept)      time400  
##   172.48148     -0.06457

#C: There is an outlier in the first data point (year 1896) and a noticeable curved trend around the 0 residual level; linear modelling is not a good fit for this data set.
qplot(time(mens400), residuals(menmodel))

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

## Warning: Removed 3 rows containing missing values (geom_point).

#D: In 2020, we expect a winning time of 42,04231 seconds; see output for 80 and 95 PIs. This assumes the trend holds linearly, which is unlikely! Current world record is 43.03 seconds!
forecast(menmodel, newdata = data.frame(time400=2020))

##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2020       42.04231 40.44975 43.63487 39.55286 44.53176

##3: Type Easter(ausbeer), what do we see? 
#This shows us which quarter has Easter in it; where partial, Easter is allocated across quarters since in countries that recognize Easter as a legal holiday, it is a few days long.
easter(ausbeer)

##      Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00

##4: Elasticity Coefficient
#Elasticity coefficient shows the percentage change that will occur in Y when X changes 1 percent. It is the ratio (% change Y) / (% change X).  When you turn a linear model into a log-log model; meaning logY = INtercept + BetalogX, Beta will become the elasticity coefficient.

##5: Fancy data set, monthly sales of a shop starting January 1987. Spikes at Christmas and Surf Festival (March) expected.
#A: sales are increasing over time.  There are noticeable spikes in December and March, with December being about 4 times taller than March.
autoplot(fancy, xlab="Year", ylab="Sales")

#B: TO stabilize the variance we would take logs of the data.  In a plot, we see this improvement.
autoplot(log(fancy), xlab="Year", ylab="Log Sales")

#C: Formulate regression with seasonal variable and a dummy variable for surf festival.
surf <- cycle(fancy) == 3
surf[3] <- FALSE

salesmodel <- tslm(fancy ~ trend + season + surf, lambda = 0)
summary(salesmodel)

## 
## Call:
## tslm(formula = fancy ~ trend + season + surf, lambda = 0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33673 -0.12757  0.00257  0.10911  0.37671 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.6196670  0.0742471 102.626  < 2e-16 ***
## trend       0.0220198  0.0008268  26.634  < 2e-16 ***
## season2     0.2514168  0.0956790   2.628 0.010555 *  
## season3     0.2660828  0.1934044   1.376 0.173275    
## season4     0.3840535  0.0957075   4.013 0.000148 ***
## season5     0.4094870  0.0957325   4.277 5.88e-05 ***
## season6     0.4488283  0.0957647   4.687 1.33e-05 ***
## season7     0.6104545  0.0958039   6.372 1.71e-08 ***
## season8     0.5879644  0.0958503   6.134 4.53e-08 ***
## season9     0.6693299  0.0959037   6.979 1.36e-09 ***
## season10    0.7473919  0.0959643   7.788 4.48e-11 ***
## season11    1.2067479  0.0960319  12.566  < 2e-16 ***
## season12    1.9622412  0.0961066  20.417  < 2e-16 ***
## surfTRUE    0.5015151  0.1964273   2.553 0.012856 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 4.105e+10 on 13 and 70 DF,  p-value: < 2.2e-16

autoplot(fancy, ylab = "Sales", xlab = "Year") + autolayer(fitted(salesmodel), color = "maroon", lwd = 1,  series = "Fitted")

#D: The residual plot over time shows evidence of non-linearity, as it appears cyclical; the residuals to fitted is OK.
autoplot(residuals(salesmodel))

qplot(fitted(salesmodel), residuals(salesmodel))

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

#E: Boxplots panelled by month shows unequal variance across the months. 
month <- factor(cycle(residuals(salesmodel)), labels = month.abb)
ggplot() + geom_boxplot((aes(x=month, y = residuals(salesmodel), group=month)))

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

gf_boxplot(residuals(salesmodel) ~ month)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

#F: The intercept is outside the range so uninterpretable; the trend coefficent shows that with each year, the log of sales increases by .02.
#December sales increase over November by log 1.96, on average, the highest value.  The surf "season" increases sales by log .50, on average.
coefficients(salesmodel)

## (Intercept)       trend     season2     season3     season4     season5 
##  7.61966701  0.02201983  0.25141682  0.26608280  0.38405351  0.40948697 
##     season6     season7     season8     season9    season10    season11 
##  0.44882828  0.61045453  0.58796443  0.66932985  0.74739195  1.20674790 
##    season12    surfTRUE 
##  1.96224123  0.50151509

#G: The B-G test is significant (p = 0.0025), indicating there is a likelyihood of serial corelation.
checkresiduals(salesmodel)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494

#H: See graphical & summary output below.
future.surf <- rep(FALSE, 36)
future.surf[c(3, 15, 27)] <- TRUE
foresale <- forecast(salesmodel, h=36, newdata = data.frame(surf=future.surf))
foresale

##          Point Forecast     Lo 80     Hi 80      Lo 95     Hi 95
## Jan 1994       13244.70  10285.82  17054.73   8969.583  19557.43
## Feb 1994       17409.81  13520.45  22418.00  11790.284  25707.73
## Mar 1994       29821.65  23129.40  38450.24  20155.412  44123.68
## Apr 1994       20774.16  16133.21  26750.16  14068.696  30675.62
## May 1994       21783.73  16917.24  28050.15  14752.395  32166.37
## Jun 1994       23162.27  17987.81  29825.24  15685.969  34201.95
## Jul 1994       27831.56  21613.98  35837.72  18848.111  41096.73
## Aug 1994       27818.48  21603.82  35820.87  18839.249  41077.41
## Sep 1994       30848.42  23956.87  39722.43  20891.193  45551.50
## Oct 1994       34095.57  26478.61  43903.67  23090.230  50346.32
## Nov 1994       55176.84  42850.31  71049.28  37366.903  81475.41
## Dec 1994      120067.79  93244.59 154607.08  81312.400 177294.90
## Jan 1995       17250.40  13357.65  22277.59  11629.938  25587.08
## Feb 1995       22675.20  17558.28  29283.31  15287.252  33633.55
## Mar 1995       38840.85  30046.98  50208.44  26146.972  57697.39
## Apr 1995       27057.06  20951.33  34942.16  18241.435  40133.06
## May 1995       28371.96  21969.51  36640.25  19127.918  42083.42
## Jun 1995       30167.42  23359.80  38958.95  20338.387  44746.58
## Jul 1995       36248.88  28068.91  46812.70  24438.412  53767.06
## Aug 1995       36231.84  28055.72  46790.69  24426.922  53741.78
## Sep 1995       40178.16  31111.50  51887.06  27087.467  59595.26
## Oct 1995       44407.37  34386.35  57348.77  29938.733  65868.34
## Nov 1995       71864.42  55647.40  92807.48  48449.831 106594.69
## Dec 1995      156380.86 121091.75 201954.08 105429.448 231955.81
## Jan 1996       22467.57  17336.40  29117.46  15065.329  33506.86
## Feb 1996       29533.04  22788.25  38274.14  19802.984  44043.89
## Mar 1996       50587.81  39009.73  65602.25  33887.802  75517.62
## Apr 1996       35240.15  27191.96  45670.42  23629.808  52555.15
## May 1996       36952.72  28513.41  47889.88  24778.151  55109.18
## Jun 1996       39291.20  30317.82  50920.48  26346.183  58596.65
## Jul 1996       47211.93  36429.60  61185.57  31657.322  70409.18
## Aug 1996       47189.73  36412.48  61156.80  31642.439  70376.07
## Sep 1996       52329.57  40378.47  67817.91  35088.887  78041.33
## Oct 1996       57837.85  44628.77  74956.52  38782.394  86256.08
## Nov 1996       93598.96  72222.70 121302.09  62761.521 139588.15
## Dec 1996      203676.38 157160.50 263959.89 136572.460 303751.35

autoplot(foresale)

#I: transform completed as we used log in our time series model formulation.

#J: the model would be improved by addresseing the non-linearity issues in the trend line.

##6: Gasoline data set of weekly supply iof US gasoline, 2/2/1991 - 1/20/1997.
head(gasoline)

## Time Series:
## Start = 1991.1 
## End = 1991.19582477755 
## Frequency = 52.1785714285714 
## [1] 6.621 6.433 6.582 7.224 6.875 6.947

gas04 = window(gasoline, end=2004)
#A: the harmonic regression shows more density towards a linear model at higher levels of the fourier term.  fourier inputs are x=gas, K=# fourier terms, h=forecast horizon desired.
gasmod2 <- tslm(gas04~trend + fourier(gas04, K=2))
gasmod4 <- tslm(gas04~trend + fourier(gas04, K=4))
gasmod8 <- tslm(gas04~trend + fourier(gas04, K=8))
gasmod12 <- tslm(gas04~trend + fourier(gas04, K=12))

plot.ts(gas04, gasmod2$fitted.values)

plot.ts(gas04, gasmod4$fitted.values)

plot.ts(gas04, gasmod8$fitted.values)

plot.ts(gas04, gasmod12$fitted.values)

#B: Based on lowest CV or AIC, the best # of fourier terms is 8.  Based on adjusted R-square, it is also 8.
CV(gasmod2)

##            CV           AIC          AICc           BIC         AdjR2 
##  8.430123e-02 -1.664847e+03 -1.664679e+03 -1.633254e+03  8.010614e-01

CV(gasmod4)

##            CV           AIC          AICc           BIC         AdjR2 
##  7.983256e-02 -1.701660e+03 -1.701261e+03 -1.652014e+03  8.127370e-01

CV(gasmod8)

##            CV           AIC          AICc           BIC         AdjR2 
##     0.0747915 -1745.9534846 -1744.7914051 -1660.2021125     0.8266732

CV(gasmod12)

##            CV           AIC          AICc           BIC         AdjR2 
##  7.412822e-02 -1.752501e+03 -1.750161e+03 -1.630644e+03  8.303062e-01

#C: CHeck residuals - normality is good, but residual plot is not! And B-G test is significant, indicating serial correlation.
checkresiduals(gasmod8)

## 
##  Breusch-Godfrey test for serial correlation of order up to 104
## 
## data:  Residuals from Linear regression model
## LM test = 147.46, df = 104, p-value = 0.003284

#D: Forecast & plot future values
fc <- forecast(gasmod8, newdata = data.frame(fourier(gasoline, 8, 52))) 
autoplot(fc)

#E: The forecast mathces the data reasonably well.
#Plot along with actual 2005 data.
autoplot(fc) + autolayer(window(gasoline, start=2005, end=2005.99), series = "Actual 2005 Data")

##7:  Huron data set of water levels lake Huron, 1875 - 1972.
head(huron)

## Time Series:
## Start = 1875 
## End = 1880 
## Frequency = 1 
## [1] 10.38 11.86 10.97 10.80  9.79 10.39

#A: The water level has a generally downward trend right thru the early 20th century, then rises, and takes a stable level. 
autoplot(huron)

#B: The slopes for before/after 1915 are almost exactly opposite, indicating a flat trend post-1915.
huronmodel <- tslm(huron ~ trend)
summary(huronmodel)

## 
## Call:
## tslm(formula = huron ~ trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.50997 -0.72726  0.00083  0.74402  2.53565 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.202037   0.230111  44.335  < 2e-16 ***
## trend       -0.024201   0.004036  -5.996 3.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.13 on 96 degrees of freedom
## Multiple R-squared:  0.2725, Adjusted R-squared:  0.2649 
## F-statistic: 35.95 on 1 and 96 DF,  p-value: 3.545e-08

t <- time(huron)
t.break <- 1915
tb1 <- ts(pmax(0, t - t.break), start = 1875)
huronpw <- tslm(huron ~ t + tb1)
summary(huronpw)

## 
## Call:
## tslm(formula = huron ~ t + tb1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49626 -0.66240 -0.07139  0.85163  2.39222 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 132.90870   19.97687   6.653 1.82e-09 ***
## t            -0.06498    0.01051  -6.181 1.58e-08 ***
## tb1           0.06486    0.01563   4.150 7.26e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.045 on 95 degrees of freedom
## Multiple R-squared:  0.3841, Adjusted R-squared:  0.3711 
## F-statistic: 29.62 on 2 and 95 DF,  p-value: 1.004e-10

#C: When we forecast 8 years, up to 1980, we see that the green Piecewise fits quite effectively and the linear model does not.
h <- 8
forelin <- forecast(huronmodel, h=h)
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] +seq(h)
newdata <- cbind(t=t.new, tb1 = tb1.new) %>% as.data.frame()
forepw <- forecast(huronpw, newdata = newdata)

autoplot(huron) + 
  autolayer(fitted(huronmodel), series = "Linear Model") +
  autolayer(fitted(huronpw), series = "Piecewise") +
  autolayer(forelin, series = "Linear Model") +
  autolayer(forepw, series = "Piecewise Model")

###CHAPTER 6
library(seasonal)

## 
## Attaching package: 'seasonal'

## The following object is masked from 'package:mosaic':
## 
##     inspect

## The following object is masked from 'package:tibble':
## 
##     view

#1: Show that a 3 X 5 MA is equivalent to a 7-term wghted moving average
#If we had terms Y1 thru Y7, the 3 X 5 MA would be equal to:
#First 5 is   sum(Y1, Y2, Y3, Y4, Y5)
#Second 5 is  sum(Y2, Y3, Y4, Y5, Y6)
#Third5 is    sum(Y3, Y4, Y5, Y6, Y7)
#So TOTAL is 1/15*Y1 + 2/15*Y2 + 3/15*Y3 + 3/15*Y4 + 3/15*Y5 +2/15*Y6 + 1/15*Y7
#We know that 1/15 = 0.067, 2/15 = /0.133, 3/15=0.20.
#These are the corresponding weights to the provided 7-term weighted avg

#2: Plastics data set, monthly sales, in thousands, over 5 years, for a single product.
#A: autoplot of the times series seems to show a positve trend with seasonal peaks.
autoplot(plastics)

#B: Use multiplicative decomposition
plasmuldeco <- decompose(plastics, type=c("multiplicative"))
plot(plasmuldeco)

#C: yes, there is a clear linear positive trend shown (with tail deviations), as well as a rolling seasonal pattern.

#D: The linear trend appears, but the tail non-linearity is quite obvious.
plasmuldeco %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted")

autoplot(plastics, series="Data", xlab = "YEAR", ylab = "Sales") +
  autolayer(trendcycle(plasmuldeco), series="Trend") +
  autolayer(seasadj(plasmuldeco), series="Seasonally Adjusted")

## Warning: Removed 12 rows containing missing values (geom_path).

#E: When I change one item by 500 (point 35) the outlier appears, but not at the full 500 strength; it is buried a bit in the seasonal peak.
plasticsA <-plastics 
plasticsA[35] <- plasticsA[35] + 500
plasmuldecoA <- decompose(plasticsA, type=c("multiplicative"))
plot(plasmuldecoA)

plasmuldecoA %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted with PLus 500")

#F: When I change one item by 500 (point 59) the outlier appears, and at almost full height of 500.
plasticsB <-plastics 
plasticsB[59] <- plasticsA[59] + 500
plasmuldecoB <- decompose(plasticsB, type=c("multiplicative"))
plot(plasmuldecoB)

plasmuldecoB %>% seasadj() %>% autoplot(ylab = "Seasonally Adjusted with TAIL END PLus 500")

#3: Retail Sales Data
#read in Retail Data, pick an item
library(readxl)
retail <- read_excel("E:/WOODS/ADECXXXX/retail.xlsx", skip = 1)
head(retail)

## # A tibble: 6 x 190
##   `Series ID`         A3349335T A3349627V A3349338X A3349398A A3349468W
##   <dttm>                  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 1982-04-01 00:00:00      303.      41.7      63.9      409.      65.8
## 2 1982-05-01 00:00:00      298.      43.1      64        405.      65.8
## 3 1982-06-01 00:00:00      298       40.3      62.7      401       62.3
## 4 1982-07-01 00:00:00      308.      40.9      65.6      414.      68.2
## 5 1982-08-01 00:00:00      299.      42.1      62.6      404.      66  
## 6 1982-09-01 00:00:00      305.      42        64.4      412.      62.3
## # ... with 184 more variables: A3349336V <dbl>, A3349337W <dbl>,
## #   A3349397X <dbl>, A3349399C <dbl>, A3349874C <dbl>, A3349871W <dbl>,
## #   A3349790V <dbl>, A3349556W <dbl>, A3349791W <dbl>, A3349401C <dbl>,
## #   A3349873A <dbl>, A3349872X <dbl>, A3349709X <dbl>, A3349792X <dbl>,
## #   A3349789K <dbl>, A3349555V <dbl>, A3349565X <dbl>, A3349414R <dbl>,
## #   A3349799R <dbl>, A3349642T <dbl>, A3349413L <dbl>, A3349564W <dbl>,
## #   A3349416V <dbl>, A3349643V <dbl>, A3349483V <dbl>, A3349722T <dbl>,
## #   A3349727C <dbl>, A3349641R <dbl>, A3349639C <dbl>, A3349415T <dbl>,
## #   A3349349F <dbl>, A3349563V <dbl>, A3349350R <dbl>, A3349640L <dbl>,
## #   A3349566A <dbl>, A3349417W <dbl>, A3349352V <dbl>, A3349882C <dbl>,
## #   A3349561R <dbl>, A3349883F <dbl>, A3349721R <dbl>, A3349478A <dbl>,
## #   A3349637X <dbl>, A3349479C <dbl>, A3349797K <dbl>, A3349477X <dbl>,
## #   A3349719C <dbl>, A3349884J <dbl>, A3349562T <dbl>, A3349348C <dbl>,
## #   A3349480L <dbl>, A3349476W <dbl>, A3349881A <dbl>, A3349410F <dbl>,
## #   A3349481R <dbl>, A3349718A <dbl>, A3349411J <dbl>, A3349638A <dbl>,
## #   A3349654A <dbl>, A3349499L <dbl>, A3349902A <dbl>, A3349432V <dbl>,
## #   A3349656F <dbl>, A3349361W <dbl>, A3349501L <dbl>, A3349503T <dbl>,
## #   A3349360V <dbl>, A3349903C <dbl>, A3349905J <dbl>, A3349658K <dbl>,
## #   A3349575C <dbl>, A3349428C <dbl>, A3349500K <dbl>, A3349577J <dbl>,
## #   A3349433W <dbl>, A3349576F <dbl>, A3349574A <dbl>, A3349816F <dbl>,
## #   A3349815C <dbl>, A3349744F <dbl>, A3349823C <dbl>, A3349508C <dbl>,
## #   A3349742A <dbl>, A3349661X <dbl>, A3349660W <dbl>, A3349909T <dbl>,
## #   A3349824F <dbl>, A3349507A <dbl>, A3349580W <dbl>, A3349825J <dbl>,
## #   A3349434X <dbl>, A3349822A <dbl>, A3349821X <dbl>, A3349581X <dbl>,
## #   A3349908R <dbl>, A3349743C <dbl>, A3349910A <dbl>, A3349435A <dbl>,
## #   A3349365F <dbl>, A3349746K <dbl>, ...

retailts <- ts(retail[, "A3349627V"], frequency = 12, start = c(1982, 4))

#Decomposing using the X11 process, note two spikes in late 1980s and at 1990; relatively flat trend until 2000, then positive. Seasonal pattern is clear.
retaildeco <- retailts %>% seas(x11 = "")
autoplot(retaildeco, main = "X11 Decomposition of Retail Item A3349627V")

#4: Discuss decompostition in figure 6.16, page 181 of printed text.
#A: We can see in this decomposition that there is a strong positive linear trend with a flat area in 1990 range;
#there is a strong and complex seasonal pattern that increases in variabilty then stabilizes into a broad pattern;
#high levels of employment are seen in December year after year;
#the trend range is quite wide in comparison to the seasonal pattern, which is quite narrow relative to the trend increase.

#B: The recession of 1991/1992 shows in the flattening of the trend line, as well as the deep negative dips in the remainder panel.

#5: Cangas data set, monthly Canadian gas production, billions cubic meters, Jan-1960 thru Feb-2005.

#A: Overall, increase gas production over time.  Flattens in late 70s - mid 80s (energy crisis I recall).
#Subseries plot also shows that flattening in each month during those years.
#The seasonal plot allows us to see that production was relatively flat in the beginning (1960s) but has become more volatile over time, within each year.
autoplot(cangas, ylab = "Billion Cubic Meters")

ggsubseriesplot(cangas, ylab = "Billion Cubic Meters")

ggseasonplot(cangas, ylab = "Billion Cubic Meters")

#B: STL seasonal component is far more stable than the other two decompositon methods; trend panel shows more volatility in SEATS and X11.
cangas %>% stl(t.window = 13, s.window="periodic", robust = TRUE) %>% autoplot(main = "STL DECOMP CANGAS")

cangas %>% seas() %>% autoplot(main = "SEATS DECOMP CANGAS")

cangas %>% seas(x11 = "") %>% autoplot(main = "X11 Decomposition of CANGAS")

#6: bricksq data, quarterly clay brick production, 1956 - 1994.
head(bricksq)

##      Qtr1 Qtr2 Qtr3 Qtr4
## 1956  189  204  208  197
## 1957  187  214

#A: Higher t values smooth the trendline; using the changing values stabilizes the seasonal width.
stlfix <- stl(bricksq, s.window = "periodic", robust = TRUE)
autoplot(stlfix, main = "STL FIXED for BRICK")

stlchange <- stl(bricksq, s.window = 13)
autoplot(stlchange, main = "STL CHANGE for BRICK")

#B: Compute and plot the seasonally adjusted forecast
changebrickfc <-forecast(stlchange)
changebrickfc

##         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1994 Q4       465.7326 437.4903 493.9748 422.5398 508.9254
## 1995 Q1       424.6727 384.7119 464.6335 363.5580 485.7874
## 1995 Q2       483.9914 435.0232 532.9595 409.1010 558.8817
## 1995 Q3       493.9988 437.4242 550.5734 407.4754 580.5222
## 1995 Q4       465.7326 402.4454 529.0198 368.9431 562.5220
## 1996 Q1       424.6727 355.3067 494.0387 318.5865 530.7589
## 1996 Q2       483.9914 409.0259 558.9568 369.3416 598.6411
## 1996 Q3       493.9988 413.8128 574.1848 371.3650 616.6326

autoplot(changebrickfc)

#C: Naive forecast seasonally adjusted data
stlfix %>% seasadj() %>% naive() %>% autoplot(main = "Naive Forecast Seasonally Adj Data")

#D: Re-seasonalize the data
brickfs <- stlf(bricksq)
autoplot(brickfs)

#E: The Ljung-Box is significant, so there is still autocorrelation. But the graphs show normal distributoin of residuals, the ACF graph is good (with one high negaitve lag), and the residuals do not show strong eviations from the assumptions.
checkresiduals(brickfs)

## Warning in checkresiduals(brickfs): The fitted degrees of freedom is based
## on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 41.128, df = 6, p-value = 2.733e-07
## 
## Model df: 2.   Total lags used: 8

#F: Revise to a robust forecast;very little change in residual diagnostics.
brickfsrobust <- stlf(bricksq, robust = TRUE)
autoplot(brickfsrobust)

checkresiduals(brickfsrobust)

## Warning in checkresiduals(brickfsrobust): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 28.163, df = 6, p-value = 8.755e-05
## 
## Model df: 2.   Total lags used: 8

#G: The stlf() forcast is better, less variability is seen in the plot prediction intervals.
bricksA <- window(bricksq, end = c(1987, 4))
forecast1 <- stlf(bricksA, robust = TRUE)
forecast2 <- snaive(bricksA)
autoplot(forecast1)

autoplot(forecast2)

#7: Using the writing data, choose forecast.
head(writing)

##          Jan     Feb     Mar     Apr     May     Jun
## 1968 562.674 599.000 668.516 597.798 579.889 668.233

autoplot(writing)

# There is an upward trend in the data, with some seasonal drops.There is little difference seen between naive adn drift models.
writenaive <- stlf(writing, method = 'naive')
autoplot(writenaive)

writedrift = stlf(writing, method = 'rwdrift')
autoplot(writedrift)

#8: Using the fancy data, choose forecast.
head(fancy)

##          Jan     Feb     Mar     Apr     May     Jun
## 1987 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74

autoplot(fancy)

# There is an upward trend in the data, with some seasonal drops.
#There is little difference seen between naive and drift models.
#There is a lot of variation change, so I will use a Box-Cox transform.
#The Box-cox lambda is quite close to 0 (.00213); I will use .1 as a moderate estimate.
fancynaive <- stlf(fancy, method = 'naive')
autoplot(fancynaive)

fancydrift = stlf(fancy, method = 'rwdrift')
autoplot(fancydrift)

BoxCox.lambda(fancy)

## [1] 0.002127317

fancylambda <- .1
fancyfinal<- stlf(fancy, method='rwdrift', lambda = fancylambda)
autoplot(fancyfinal, main = "STL Forecast, RW Drift with lambda= 0.1")

HW1-Notes.R

boardmal

2019-11-10