library(forecast)
## Warning: package 'forecast' was built under R version 4.0.4
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
library(TTR)
## Warning: package 'TTR' was built under R version 4.0.4
library(stats)
library(readr)
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v fma 2.4 v expsmooth 2.3
## Warning: package 'fma' was built under R version 4.0.5
## Warning: package 'expsmooth' was built under R version 4.0.5
##
library(seasonal)
## Warning: package 'seasonal' was built under R version 4.0.5
daily20 <- head(elecdaily, 20)
summary(daily20)
## Demand WorkDay Temperature
## Min. :169.5 Min. :0.00 Min. :19.60
## 1st Qu.:188.4 1st Qu.:0.00 1st Qu.:22.38
## Median :203.6 Median :1.00 Median :25.00
## Mean :230.5 Mean :0.65 Mean :28.30
## 3rd Qu.:255.6 3rd Qu.:1.00 3rd Qu.:32.80
## Max. :347.6 Max. :1.00 Max. :43.20
autoplot(daily20)
elecdailylm <- tslm(Demand ~ Temperature, data = daily20)
summary(elecdailylm)
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.060 -7.117 -1.437 17.484 27.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.2117 17.9915 2.179 0.0428 *
## Temperature 6.7572 0.6114 11.052 1.88e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22 on 18 degrees of freedom
## Multiple R-squared: 0.8716, Adjusted R-squared: 0.8644
## F-statistic: 122.1 on 1 and 18 DF, p-value: 1.876e-09
There is a positive relationship because as the temperature goes up, demand goes up (ie. air conditioning)
checkresiduals(elecdailylm)
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
No notable lags, some heteroskedasticity and non normal residuals but Breusch Godfrey test p value large so predictions should be ok
tempforecast <- data.frame(Temperature = c(15,35))
elecdailyfc <- forecast(elecdailylm, newdata = tempforecast)
summary(elecdailyfc)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Coefficients:
## (Intercept) Temperature
## 39.212 6.757
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 4.263603e-15 20.87457 16.44449 -0.8271304 7.763681 0.2872158
## ACF1
## Training set 0.0332374
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 4.285714 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.428571 275.7146 245.2278 306.2014 227.57056 323.8586
Both these values fall within the minimum and maximum of daily20 so I believe them
autoplot(daily20, facets=TRUE)
daily20 %>%
as.data.frame() %>%
ggplot(aes(x=Temperature, y=Demand)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
fit <- tslm(Demand ~ Temperature, data=daily20)
checkresiduals(fit)
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
forecast(fit, newdata=data.frame(Temperature=c(15,35)))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 4.285714 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.428571 275.7146 245.2278 306.2014 227.57056 323.8586
elecdailyfc$lower
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [,1] [,2]
## 4.285714 108.6810 90.21166
## 4.428571 245.2278 227.57056
elecdailyfc$upper
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [,1] [,2]
## 4.285714 172.4591 190.9285
## 4.428571 306.2014 323.8586
95% CI is a larger range than 80% CI so the first column of lower and upper intervals correspond to 80 and second column corresponds to 95
elecdaily %>%
as.data.frame() %>%
ggplot(aes(x=Temperature, y=Demand)) +
geom_point()
Our previous model only captures a section of the total data
summary(mens400)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 43.03 43.92 45.00 46.02 47.65 54.20 3
autoplot(mens400)
The winning time has followed a downward trend, there are also years missing from presumably wars or other conflicts which prevented the olympics
tmens400 <- time(mens400)
mens400lm <- tslm(mens400 ~ tmens400, data = mens400)
summary(mens400lm)
##
## Call:
## tslm(formula = mens400 ~ tmens400, data = mens400)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6002 -0.5747 -0.2858 0.5751 4.1505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 172.481477 11.487522 15.02 2.52e-14 ***
## tmens400 -0.064574 0.005865 -11.01 2.75e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.136 on 26 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.8234, Adjusted R-squared: 0.8166
## F-statistic: 121.2 on 1 and 26 DF, p-value: 2.752e-11
Average decrease of 0.06 seconds per year
cbind(Year = tmens400, Residuals = mens400lm$residuals) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Residuals)) +
geom_point()
## Warning: Removed 3 rows containing missing values (geom_point).
There is a better, non linear fit than the standard linear regression model
forecastmens <- data.frame(tmens400 = 2020)
fcmens400 <- forecast(mens400lm, newdata = forecastmens)
summary(fcmens400)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = mens400 ~ tmens400, data = mens400)
##
## Coefficients:
## (Intercept) tmens400
## 172.48148 -0.06457
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -2.538059e-16 1.094591 0.7839455 -0.04728455 1.667093 0.01703555
## ACF1
## Training set 0.1461939
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020 42.04231 40.44975 43.63487 39.55286 44.53176
Prediction interval above for 80 and 95 confidence interval. We have assumed normal distribution to recieve confidence intervals
easter(ausbeer)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00
Easter gives us 0, 1 or fractional on when easter is. This is a dummy variable
tsfancy <- ts(fancy)
autoplot(tsfancy)
Very clear seasonal trend in the data and as business expands, the variation does as well
In this case, logarithms would be useful for us to analyze the percentage change in sales. It would also result in constant scale and help to some degree the heteroskedasticity that would be present in our analysis otherwise
summary(fancy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1665 5884 8772 14316 16889 104661
lambda <- BoxCox.lambda(fancy)
fancylm <- tslm(BoxCox(fancy, 0) ~ season + trend)
summary(fancylm)
##
## Call:
## tslm(formula = BoxCox(fancy, 0) ~ season + trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41644 -0.12619 0.00608 0.11389 0.38567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6058604 0.0768740 98.939 < 2e-16 ***
## season2 0.2510437 0.0993278 2.527 0.013718 *
## season3 0.6952066 0.0993386 6.998 1.18e-09 ***
## season4 0.3829341 0.0993565 3.854 0.000252 ***
## season5 0.4079944 0.0993817 4.105 0.000106 ***
## season6 0.4469625 0.0994140 4.496 2.63e-05 ***
## season7 0.6082156 0.0994534 6.116 4.69e-08 ***
## season8 0.5853524 0.0995001 5.883 1.21e-07 ***
## season9 0.6663446 0.0995538 6.693 4.27e-09 ***
## season10 0.7440336 0.0996148 7.469 1.61e-10 ***
## season11 1.2030164 0.0996828 12.068 < 2e-16 ***
## season12 1.9581366 0.0997579 19.629 < 2e-16 ***
## trend 0.0223930 0.0008448 26.508 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1858 on 71 degrees of freedom
## Multiple R-squared: 0.9527, Adjusted R-squared: 0.9447
## F-statistic: 119.1 on 12 and 71 DF, p-value: < 2.2e-16
tfancy <- time(fancy)
cbind(Time = tfancy, Residuals = fancylm$residuals) %>%
as.data.frame() %>%
ggplot(aes(x=Time, y=Residuals)) +
geom_point()
Very clear non random trend in data, these are most likely due to seasonal trends
boxplot(fancylm$residuals ~ cycle(fancylm$residuals))
Non-constant variance is present in the model which means that our model is not the best one.
Those coefficients indicate if there is a positive or negative change in sales associated with that month. They cannot tell us exact percentage change only direction
checkresiduals(fancylm)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 37.152, df = 17, p-value = 0.003209
Breusch Godfrey test has significant p-value which tells us that there is significant autocorrelation and that forecasts would be inaccurate
tempforecast <- data.frame(Temperature = c(15,35)) elecdailyfc <- forecast(elecdailylm, newdata = tempforecast) summary(elecdailyfc)
fcfancy <- rep(0, 36)
tsforecast <- ts(fcfancy, start = 1994, end = c(1996, 12), frequency = 12)
saleforecast <- forecast(fancylm, newdata = tsforecast)
## Warning in forecast.lm(fancylm, newdata = tsforecast): newdata column names not
## specified, defaulting to first variable required.
autoplot(saleforecast)
backtransform <- as.data.frame(saleforecast)
exp(backtransform)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1994 13484.06 10373.35 17527.60 9000.202 20201.76
## Feb 1994 17724.45 13635.50 23039.57 11830.533 26554.69
## Mar 1994 28261.51 21741.71 36736.44 18863.703 42341.27
## Apr 1994 21149.61 16270.49 27491.85 14116.722 31686.25
## May 1994 22177.42 17061.19 28827.88 14802.755 33226.11
## Jun 1994 23580.87 18140.87 30652.19 15739.516 35328.75
## Jul 1994 28334.55 21797.90 36831.38 18912.452 42450.69
## Aug 1994 28321.23 21787.65 36814.06 18903.560 42430.74
## Sep 1994 31405.93 24160.73 40823.80 20962.508 47052.23
## Oct 1994 34711.77 26703.92 45120.97 23169.053 52005.01
## Nov 1994 56174.03 43214.94 73019.23 37494.462 84159.68
## Dec 1994 122237.73 94038.05 158893.80 81589.975 183136.01
## Jan 1995 17640.97 13531.52 22998.44 11721.681 26549.43
## Feb 1995 23188.60 17786.83 30230.86 15407.845 34898.53
## Mar 1995 36974.06 28360.98 48202.90 24567.703 55645.47
## Apr 1995 27669.68 21224.05 36072.82 18385.332 41642.49
## May 1995 29014.35 22255.47 37825.85 19278.808 43666.20
## Jun 1995 30850.46 23663.86 40219.58 20498.826 46429.53
## Jul 1995 37069.62 28434.27 48327.47 24631.193 55789.28
## Aug 1995 37052.19 28420.91 48304.75 24619.613 55763.05
## Sep 1995 41087.86 31516.47 53566.03 27301.145 61836.67
## Oct 1995 45412.83 34833.94 59204.47 30174.903 68345.69
## Nov 1995 73491.54 56371.74 95810.55 48832.025 110603.79
## Dec 1995 159921.57 122667.95 208488.92 106261.125 240679.82
## Jan 1996 23079.39 17640.46 30195.26 15251.766 34924.36
## Feb 1996 30337.26 23187.92 39690.89 20048.051 45907.16
## Mar 1996 48372.55 36972.98 63286.85 31966.480 73198.66
## Apr 1996 36199.78 27668.87 47360.95 23922.233 54778.49
## May 1996 37958.98 29013.50 49662.56 25084.787 57440.57
## Jun 1996 40361.14 30849.55 52805.34 26672.224 61075.57
## Jul 1996 48497.56 37068.53 63450.40 32049.090 73387.82
## Aug 1996 48474.75 37051.10 63420.57 32034.022 73353.32
## Sep 1996 53754.55 41086.65 70328.24 35523.121 81342.86
## Oct 1996 59412.84 45411.50 77731.09 39262.336 89905.12
## Nov 1996 96147.75 73489.39 125792.17 63538.212 145493.40
## Dec 1996 209222.70 159916.89 273730.57 138262.581 316601.49
The first column is the point predictions, the next two are the lower and upper 80 confidence intervals and the last two columns consist of the lower and upper 95 confidence interval predictions
gaswindow <- window(gasoline, end = 2005)
autoplot(gaswindow)
gaslm1 <- tslm(gaswindow ~ trend + fourier(gaswindow, K=1))
gaslm2 <- tslm(gaswindow ~ trend + fourier(gaswindow, K=2))
gaslm5 <- tslm(gaswindow ~ trend + fourier(gaswindow, K=5))
gaslm10 <- tslm(gaswindow ~ trend + fourier(gaswindow, K=10))
autoplot(gaswindow) +
autolayer(gaslm1$fitted.values, series = "Fourier transform K = 1")
autoplot(gaswindow) +
autolayer(gaslm2$fitted.values, series = "Fourier transform K = 2")
autoplot(gaswindow) +
autolayer(gaslm5$fitted.values, series = "Fourier transform K = 5")
autoplot(gaswindow) +
autolayer(gaslm10$fitted.values, series = "Fourier transform K = 10")
We notice that the higher the value of K, the more nuanced the data becomes
CV(gaslm1)
## CV AIC AICc BIC AdjR2
## 8.201921e-02 -1.813292e+03 -1.813208e+03 -1.790354e+03 8.250858e-01
CV(gaslm2)
## CV AIC AICc BIC AdjR2
## 8.136854e-02 -1.819113e+03 -1.818957e+03 -1.787001e+03 8.269569e-01
CV(gaslm5)
## CV AIC AICc BIC AdjR2
## 7.553646e-02 -1.873234e+03 -1.872723e+03 -1.813596e+03 8.406928e-01
CV(gaslm10)
## CV AIC AICc BIC AdjR2
## 7.135754e-02 -1.915014e+03 -1.913441e+03 -1.809500e+03 8.516102e-01
We notice that as our K increases, the AICc becomes smaller. CV values are comparable across all 4 so therefore the higher K the better. The book also says that K has a maximum value of m/2, where our m is 52 for weekly data. I have tried another model where K = 26 to compare
gaslm26 <- tslm(gaswindow ~ trend + fourier(gaswindow, K=26))
CV(gaslm26)
## CV AIC AICc BIC AdjR2
## 7.426757e-02 -1.889694e+03 -1.880500e+03 -1.637379e+03 8.526098e-01
Our AICc value at K = 10 performed better so i will use the model where fourier term K = 10
checkresiduals(gaslm10)
##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 155.45, df = 104, p-value = 0.0008135
fc <- forecast(gaslm10, newdata=data.frame(fourier(gaswindow,K = 10,52)))
summary(fc)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = gaswindow ~ trend + fourier(gaswindow, K = 10))
##
## Coefficients:
## (Intercept) trend
## 7.092972 0.002799
## fourier(gaswindow, K = 10)S1-52 fourier(gaswindow, K = 10)C1-52
## 0.005822 -0.273068
## fourier(gaswindow, K = 10)S2-52 fourier(gaswindow, K = 10)C2-52
## -0.042246 -0.019506
## fourier(gaswindow, K = 10)S3-52 fourier(gaswindow, K = 10)C3-52
## 0.023718 -0.099742
## fourier(gaswindow, K = 10)S4-52 fourier(gaswindow, K = 10)C4-52
## 0.016751 -0.028523
## fourier(gaswindow, K = 10)S5-52 fourier(gaswindow, K = 10)C5-52
## 0.003452 -0.051467
## fourier(gaswindow, K = 10)S6-52 fourier(gaswindow, K = 10)C6-52
## 0.062397 -0.051832
## fourier(gaswindow, K = 10)S7-52 fourier(gaswindow, K = 10)C7-52
## 0.031550 0.043259
## fourier(gaswindow, K = 10)S8-52 fourier(gaswindow, K = 10)C8-52
## 0.021197 0.018102
## fourier(gaswindow, K = 10)S9-52 fourier(gaswindow, K = 10)C9-52
## -0.017916 0.013770
## fourier(gaswindow, K = 10)S10-52 fourier(gaswindow, K = 10)C10-52
## -0.017268 0.030860
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.202927e-18 0.259095 0.2055967 -0.1072588 2.573184 0.6279583
## ACF1
## Training set -0.1261056
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2005.014 8.745595 8.402358 9.088832 8.220249 9.270941
## 2005.033 8.604075 8.260737 8.947413 8.078574 9.129575
## 2005.052 8.607195 8.263839 8.950551 8.081666 9.132724
## 2005.071 8.683215 8.339879 9.026550 8.157718 9.208712
## 2005.090 8.746645 8.403404 9.089887 8.221293 9.271998
## 2005.110 8.783003 8.439850 9.126156 8.257785 9.308221
## 2005.129 8.833517 8.490370 9.176664 8.308309 9.358725
## 2005.148 8.917498 8.574348 9.260647 8.392285 9.442710
## 2005.167 8.997861 8.654729 9.340992 8.472676 9.523046
## 2005.186 9.029128 8.686005 9.372252 8.503956 9.554301
## 2005.205 9.018507 8.675381 9.361633 8.493330 9.543684
## 2005.225 9.017572 8.674439 9.360705 8.492385 9.542760
## 2005.244 9.056052 8.712918 9.399187 8.530863 9.581242
## 2005.263 9.105523 8.762395 9.448651 8.580344 9.630703
## 2005.282 9.121048 8.777925 9.464172 8.595876 9.646221
## 2005.301 9.105879 8.762754 9.449005 8.580703 9.631055
## 2005.320 9.112513 8.769382 9.455644 8.587329 9.637697
## 2005.340 9.175080 8.831948 9.518211 8.649895 9.700265
## 2005.359 9.259000 8.915873 9.602127 8.733822 9.784178
## 2005.378 9.296291 8.953167 9.639415 8.771118 9.821464
## 2005.397 9.268208 8.925083 9.611334 8.743032 9.793385
## 2005.416 9.234527 8.891397 9.577657 8.709345 9.759710
## 2005.435 9.270246 8.927115 9.613376 8.745062 9.795429
## 2005.455 9.381554 9.038427 9.724681 8.856376 9.906731
## 2005.474 9.497887 9.154762 9.841011 8.972713 10.023060
## 2005.493 9.546877 9.203751 9.890002 9.021700 10.072053
## 2005.512 9.524722 9.181592 9.867851 8.999539 10.049904
## 2005.531 9.487091 9.143960 9.830221 8.961908 10.012274
## 2005.550 9.482387 9.139260 9.825513 8.957209 10.007564
## 2005.570 9.509124 9.165999 9.852248 8.983950 10.034297
## 2005.589 9.536544 9.193418 9.879670 9.011367 10.061720
## 2005.608 9.546848 9.203718 9.889978 9.021665 10.072030
## 2005.627 9.541950 9.198820 9.885081 9.016767 10.067133
## 2005.646 9.517407 9.174280 9.860533 8.992229 10.042584
## 2005.665 9.453438 9.110314 9.796562 8.928265 9.978612
## 2005.685 9.344405 9.001279 9.687531 8.819228 9.869581
## 2005.704 9.226892 8.883762 9.570023 8.701709 9.752076
## 2005.723 9.160315 8.817184 9.503446 8.635131 9.685499
## 2005.742 9.174078 8.830951 9.517205 8.648901 9.699255
## 2005.761 9.241612 8.898488 9.584736 8.716439 9.766785
## 2005.780 9.310432 8.967306 9.653559 8.785255 9.835609
## 2005.800 9.348790 9.005658 9.691922 8.823604 9.873976
## 2005.819 9.354217 9.011085 9.697350 8.829031 9.879404
## 2005.838 9.326591 8.983464 9.669718 8.801413 9.851768
## 2005.857 9.258087 8.914964 9.601211 8.732915 9.783260
## 2005.876 9.163285 8.820158 9.506412 8.638107 9.688463
## 2005.895 9.102531 8.759394 9.445669 8.577338 9.627725
## 2005.915 9.142445 8.799307 9.485583 8.617250 9.667640
## 2005.934 9.276104 8.932975 9.619234 8.750923 9.801286
## 2005.953 9.396214 9.053089 9.739340 8.871039 9.921390
## 2005.972 9.374939 9.031803 9.718075 8.849747 9.900131
## 2005.991 9.184286 8.841055 9.527518 8.658948 9.709624
autoplot(fc, start = 2005) +
autolayer(window(gasoline, start = 2004, end = 2006)) +
xlim(2004, 2006)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 674 row(s) containing missing values (geom_path).
Our forecast was able to follow the general trend of the real data but not match the nuance. Perhaps a better K value could be chosen in the future to make these predictions even better
summary(huron)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.960 8.135 9.120 9.004 9.875 11.860
autoplot(huron)
The data is a yearly measurment of lake Huron’s depth in feet (reduced by 570 feet? So 12 ft on the chart would represent 582).
thuron <- time(huron)
huronlm <- tslm(huron ~ trend)
summary(huronlm)
##
## Call:
## tslm(formula = huron ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50997 -0.72726 0.00083 0.74402 2.53565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.202037 0.230111 44.335 < 2e-16 ***
## trend -0.024201 0.004036 -5.996 3.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 96 degrees of freedom
## Multiple R-squared: 0.2725, Adjusted R-squared: 0.2649
## F-statistic: 35.95 on 1 and 96 DF, p-value: 3.545e-08
t.break <- 1915
tb <- ts(pmax(0, thuron - t.break), start = 1875)
huronpw <- tslm(huron ~ thuron + tb)
summary(huronpw)
##
## Call:
## tslm(formula = huron ~ thuron + tb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49626 -0.66240 -0.07139 0.85163 2.39222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 132.90870 19.97687 6.653 1.82e-09 ***
## thuron -0.06498 0.01051 -6.181 1.58e-08 ***
## tb 0.06486 0.01563 4.150 7.26e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.045 on 95 degrees of freedom
## Multiple R-squared: 0.3841, Adjusted R-squared: 0.3711
## F-statistic: 29.62 on 2 and 95 DF, p-value: 1.004e-10
The difference is that after 1915 on the piecewise function the slope becomes positive
h = 8
forecastlm <- forecast(huronlm, h = 8)
autoplot(forecastlm)
thuron.new <- thuron[length(thuron)] + seq(h)
tb.new <- tb[length(tb)] + seq(h)
newdata <- cbind(thuron=thuron.new,
tb=tb.new) %>%
as.data.frame()
forecastpw <- forecast(huronpw,newdata = newdata)
autoplot(forecastpw)
Looking above, the first prediction (linear) predicts a continual downwards trend for the next 8 years. However, the piecewise function predictions follow a flat line trend.
CHAPTER 6
Consider a 3 x 5-MA as first taking a 5-MA and then followed by a 3-MA:
We then have the first term as (Y1 + Y2 + Y3 + Y4 + Y5)/5, second term as (Y2 + Y3 + Y4 + Y5 + Y6)/5 … final term as (Y5 + Y6 + Y7 + Y8+ Y9 + Y10)/5
Then we consider the 3-MA to consist of ((Y1 + Y2 + Y3 + Y4 + Y5/5) + … + Y6 + Y7/5)/3).
We see that Y1 and Y7 have no repeats which gives us 1/15[Y1 + Y7] Y2 and Y6 are repeated twice which gives us 2/15[Y2 + Y6] Y3, Y4 and Y5 are repeated 3 times each so that is 3/15 or 1/5[Y3 + Y4 + Y5] If we then replace Y1,…Y7 with 1: We can see that the weights come out to 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067
autoplot(plastics)
We can see a clear upwards trend as well as a seasonal trend like a sine wave with a period of 1 year
mplastics <- decompose(plastics, type = "multiplicative")
autoplot(mplastics)
Yes, both the upwards trend and sine wave like seasonal trend are present
autoplot(seasadj(mplastics))
plasticdata <- plastics
plasticdata[6] <- plasticdata[6] + 500
mplasticdata <- decompose(plasticdata, type = "multiplicative")
autoplot(seasadj(mplasticdata))
The outlier causes a huge spike where it is located at, as expected. Interestingly enough, the resulting adjusted peak isnt as high as I expected because the observation that was one that was taking during a seasonal high anyways.
plasticdata[6] <- plasticdata[6] - 500
plasticdata[30] <- plasticdata[30] + 500
mplasticdata2 <- decompose(plasticdata, type = "multiplicative")
autoplot(seasadj(mplasticdata2))
plasticdata[30] <- plasticdata[30] - 500
plasticdata[60] <- plasticdata[60] + 500
mplasticdata3 <- decompose(plasticdata, type = "multiplicative")
autoplot(seasadj(mplasticdata3))
Yes, the time of year and seasonality affects the resulting magnitude of change in the seasonaly adjusted graph
retaildata <- readxl::read_excel("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\retail.xlsx", skip=1)
tsretail <- ts(retaildata[,"A3349336V"], frequency=12, start=c(1982,4))
x11retail <- tsretail %>% seas(x11="")
autoplot(x11retail)
The trend line seems much more nuanced when compared with the STL or classical decomposition. Similarly, there is greater detail in the seasonal trend data for the x11 decomposition when compared with the other methods.
When considering only trend and seasonal plot, everything looks okay. However, looking at remainder and considering part b’s question, we would note that there was a recession around 1991/1992. This results in a negative remainder during that time period with a magnitude nearly 3 times larger than the next largest.If not for the remainder component, you would miss that significant event. Perhaps x11 or another form of decomposition would be better for this analysis to better capture that depression.
b.Is the recession of 1991/1992 visible in the estimated components?
Yes, the most apparently visible signs of the recession are from the march, april, august and november components where there is a significant downwards trend in the later half of the line.
autoplot(cangas)
cangasplot <- cangas %>% seas(x11="")
cangasplot %>% seasonal() %>% ggsubseriesplot() + ylab("Seasonal")
cangas %>% stl(s.window = "periodic", robust = TRUE) %>% autoplot()
cangas %>% stl(s.window = "periodic") %>% autoplot()
The difference between the two stl() functions is that the top one has robust = TRUE, as done in the book. The only notable difference is the scale of the remainder which is smaller when robust = FALSE (the second graph)
cangas %>% seas(x11 = "") %>% autoplot()
above is the x11 decomp
cangas %>% seas() %>% autoplot()
above is the SEATS decomp
Between all three decompositions, the trend looks similar. Both SEATS and x11 have a more detailed seasonal component and x11 has the smallest remainder/unexplained section out of the three.
bricksq %>% stl(s.window = "periodic") %>% autoplot()
bricksq %>% stl(s.window = 11) %>% autoplot()
bricksq %>% stl(s.window = 21) %>% autoplot()
bricksq %>% stl(s.window = 45) %>% autoplot()
bricksq %>% stl(s.window = 101) %>% autoplot()
Regardless of s.window the remainder and trend essentially stay the same. The only real difference is in the seasonal variation
brickseasadj <- bricksq %>% stl(s.window = "periodic") %>% seasadj()
autoplot(brickseasadj)
autoplot(brickseasadj) +
autolayer(naive(brickseasadj, h=11), series="Naïve", PI=TRUE)
stlfbrick <- stlf(bricksq)
stlfbrick
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1994 Q4 465.7326 437.4903 493.9748 422.5398 508.9254
## 1995 Q1 424.6727 384.7119 464.6335 363.5580 485.7874
## 1995 Q2 483.9914 435.0232 532.9595 409.1010 558.8817
## 1995 Q3 493.9988 437.4242 550.5734 407.4754 580.5222
## 1995 Q4 465.7326 402.4454 529.0198 368.9431 562.5220
## 1996 Q1 424.6727 355.3067 494.0387 318.5865 530.7589
## 1996 Q2 483.9914 409.0259 558.9568 369.3416 598.6411
## 1996 Q3 493.9988 413.8128 574.1848 371.3650 616.6326
autoplot(stlfbrick)
checkresiduals(stlfbrick)
## Warning in checkresiduals(stlfbrick): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 41.128, df = 6, p-value = 2.733e-07
##
## Model df: 2. Total lags used: 8
Ljung box test would indicate that the residuals are correlated to each other.
stlbrick <- stl(bricksq, s.window = "periodic", robust = TRUE) %>% seasadj()
autoplot(stlbrick) +
autolayer(naive(brickseasadj, h=11), series="Naïve", PI=TRUE)
Robust vs non-robust doesnt seem to change too much
bricksq.train <- window(bricksq, end = c(1992,3))
bricksq.test <- window(bricksq, start = c(1992, 4), end = c(1994,3))
trainstlfbrick <- stlf(bricksq.train)
trainsnaivebrick <- snaive(bricksq.train)
fcstlf <- forecast(trainstlfbrick, h = 8)
fcsnaive <- forecast(trainsnaivebrick, h = 8)
fcstlf
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1992 Q4 423.9034 397.7584 450.0484 383.9181 463.8887
## 1993 Q1 378.9095 341.9154 415.9036 322.3319 435.4871
## 1993 Q2 441.9958 396.6620 487.3296 372.6637 511.3279
## 1993 Q3 450.9971 398.6202 503.3739 370.8936 531.1006
## 1993 Q4 423.9034 365.3107 482.4961 334.2936 513.5132
## 1994 Q1 378.9095 314.6874 443.1316 280.6903 477.1287
## 1994 Q2 441.9958 372.5879 511.4036 335.8457 548.1459
## 1994 Q3 450.9971 376.7541 525.2401 337.4523 564.5419
fcsnaive
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1992 Q4 430 366.2905 493.7095 332.5647 527.4353
## 1993 Q1 387 323.2905 450.7095 289.5647 484.4353
## 1993 Q2 413 349.2905 476.7095 315.5647 510.4353
## 1993 Q3 451 387.2905 514.7095 353.5647 548.4353
## 1993 Q4 430 339.9011 520.0989 292.2056 567.7944
## 1994 Q1 387 296.9011 477.0989 249.2056 524.7944
## 1994 Q2 413 322.9011 503.0989 275.2056 550.7944
## 1994 Q3 451 360.9011 541.0989 313.2056 588.7944
Above are values for forecasts. Decimal number forecast is stlf() and integer forecast is snaive()
autoplot(bricksq) +
autolayer(fcstlf, series = "STLF forecast", PI = FALSE) +
autolayer(fcsnaive, series = "SNaive forecast", PI = FALSE) +
xlim(1991, 1994)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
The STLF forecast seems to perform a bit better than the SNAive but not by too much
autoplot(writing)
This is what the data looks like. There is a clear upwards trend so I will use the drift method
BoxCox.lambda(writing)
## [1] 0.1557392
I will use a lambda of zero since the value found is close to zero
stlfdrift <- stlf(BoxCox(writing, 0), s.window = "periodic", robust = TRUE, method = "rwdrift")
autoplot(stlfdrift)
writingbt <- as.data.frame(stlfdrift)
exp(writingbt)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1978 1031.1688 940.2178 1130.9179 895.3650 1187.5706
## Feb 1978 1072.8512 941.0006 1223.1764 877.8950 1311.1018
## Mar 1978 1163.5092 990.2186 1367.1261 909.1903 1488.9663
## Apr 1978 1063.0223 881.7289 1281.5916 798.6317 1414.9406
## May 1978 1031.5165 836.2041 1272.4479 748.2610 1421.9989
## Jun 1978 1120.1457 889.2101 1411.0573 786.9091 1594.4997
## Jul 1978 902.2753 702.4236 1158.9882 615.2274 1323.2517
## Aug 1978 372.5326 284.7425 487.3896 246.9840 561.9011
## Sep 1978 961.8721 722.4958 1280.5582 620.9325 1490.0136
## Oct 1978 1070.3055 790.6573 1448.8627 673.5441 1700.7851
## Nov 1978 1007.4600 732.4049 1385.8122 618.6516 1640.6254
## Dec 1978 1055.7887 755.7592 1474.9272 633.1733 1760.4813
## Jan 1979 1095.5623 772.5644 1553.6009 642.1379 1869.1574
## Feb 1979 1139.8476 792.1694 1640.1197 653.3752 1988.5243
## Mar 1979 1236.1669 847.0000 1804.1426 693.3691 2203.8892
## Apr 1979 1129.4049 763.1937 1671.3389 620.1953 2056.6995
## May 1979 1095.9316 730.5947 1643.9568 589.4557 2037.5852
## Jun 1979 1190.0955 782.8875 1809.1070 627.2152 2258.1200
## Jul 1979 958.6197 622.4352 1476.3814 495.2335 1855.5930
## Aug 1979 395.7962 253.7150 617.4434 200.4987 781.3248
## Sep 1979 1021.9381 646.8657 1614.4890 507.7806 2056.7103
## Oct 1979 1137.1429 710.8874 1818.9857 554.3732 2332.5336
## Nov 1979 1070.3729 660.9856 1733.3177 512.1209 2237.1633
## Dec 1979 1121.7196 684.3555 1838.5983 526.8391 2388.3095
Above are the point forecasts for stlf() drift with prediciton intervals
autoplot(fancy)
The peaks seem to follow an upwards trend so i will use drift again. Additionally, it is clear that this will need to be transformed using BoxCox.
BoxCox.lambda(fancy)
## [1] 0.002127317
stlfdriftfancy <- stlf(BoxCox(fancy, 0), s.window = "periodic", robust = TRUE, method = "rwdrift")
autoplot(stlfdriftfancy)
fancybt <- as.data.frame(stlfdriftfancy)
exp(fancybt)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1994 15334.44 12236.303 19216.99 10858.325 21655.73
## Feb 1994 20416.99 14809.533 28147.65 12494.555 33362.81
## Mar 1994 34233.85 23048.942 50846.43 18694.035 62691.46
## Apr 1994 24613.84 15546.509 38969.60 12189.877 49700.36
## May 1994 27503.40 16405.643 46108.35 12479.773 60613.06
## Jun 1994 27386.53 15499.509 48390.05 11466.954 65407.26
## Jul 1994 32989.70 17776.176 61223.54 12813.915 84932.70
## Aug 1994 35605.10 18315.656 69215.26 12882.478 98406.76
## Sep 1994 36806.12 18114.317 74785.62 12446.022 108845.25
## Oct 1994 42146.07 19880.425 89348.77 13355.995 132995.82
## Nov 1994 65955.50 29863.221 145668.43 19632.405 221578.99
## Dec 1994 143727.91 62545.803 330281.34 40263.802 513059.14
## Jan 1995 21058.40 8817.238 50294.25 5561.376 79738.60
## Feb 1995 28038.15 11306.394 69530.37 6990.819 112452.89
## Mar 1995 47012.50 18273.576 120949.22 11080.933 199457.46
## Apr 1995 33801.58 12673.867 90149.84 7540.212 151527.16
## May 1995 37769.74 13670.037 104356.23 7982.157 178717.81
## Jun 1995 37609.24 13147.287 107585.33 7537.054 187666.86
## Jul 1995 45303.94 15304.899 134103.92 8616.610 238196.58
## Aug 1995 48895.60 15971.019 149694.87 8832.691 270673.97
## Sep 1995 50544.93 15970.014 159974.20 8678.091 294395.42
## Oct 1995 57878.16 17696.492 189296.36 9450.609 354461.95
## Nov 1995 90575.06 26809.657 306003.23 14073.600 582924.20
## Dec 1995 197377.98 56577.532 688578.40 29199.805 1334189.27
Above are the point forecasts for the fancy predictions using stlf() drift and their prediction intervals