I. Clean the environment and install required packages
rm(list = ls())
cat("\f")
packages <- c("fpp2", "seasonal")
for (i in seq_along(packages)) {
  # install any missing package, then load it
  if (!packages[i] %in% installed.packages()) {
    install.packages(packages[i])
  }
  library(packages[i], character.only = TRUE)
}
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v ggplot2 3.3.5 v fma 2.4
## v forecast 8.15 v expsmooth 2.3
##
rm(packages)
setwd("C:/Users/H/Desktop/Predictive Analytics")
1a) Demand for electricity has a positive relationship with temperature. One possible explanation for this approximately linear positive relationship is that the 20 days used to construct the plot fall mostly in summer, so air-conditioner usage rises as the temperature increases, leading to higher demand for electricity.
daily20 <- head(elecdaily,20)
autoplot(daily20, facets = T)
daily20 %>%
as.data.frame() %>%
ggplot(aes(y = Demand, x = Temperature)) +
geom_point() +
geom_smooth(method = "lm", se = F)
## `geom_smooth()` using formula 'y ~ x'
1b) No, this model does not seem adequate: the residuals do not have constant variance, so heteroskedasticity is present. Additionally, the residual distribution plot shows that the residuals are not normally distributed. However, no outliers are observed.
model1 <- tslm(Demand ~ Temperature, data = daily20)
checkresiduals(model1)
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
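The heteroskedasticity noted above can also be tested formally. A minimal sketch using the Breusch-Pagan test, assuming the lmtest package is installed (it is not loaded above):
# hedged check: formal test for non-constant residual variance
# install.packages("lmtest") # if needed
lmtest::bptest(model1) # a small p-value would confirm heteroskedasticity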
1c) No, I would not say the results are convincing: demand for electricity is also expected to be high when the temperature is as low as 15 degrees, due to increased use of heaters, and 15 degrees lies below the bulk of the temperatures observed in this 20-day summer sample, so that forecast is largely an extrapolation.
forecast(model1, newdata = data.frame(Temperature = c(15, 35)))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 4.285714 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.428571 275.7146 245.2278 306.2014 227.57056 323.8586
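Since the model was fitted on only 20 summer days, a quick check of the observed temperature range shows how far the 15-degree scenario extrapolates beyond the data:
# temperatures actually observed in the 20-day sample
range(daily20[, "Temperature"])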
1d) Demand for electricity has an approximately quadratic relationship with temperature. The reason could be that on days with low temperatures, higher demand for heating leads to more electricity usage, while air conditioners are frequently used on hotter days, which also results in higher demand for electricity.
elecdaily %>%
as.data.frame() %>%
ggplot(aes(y = Demand, x = Temperature)) +
geom_point() +
stat_smooth(aes(y=Demand), method = "lm", formula = y ~ x + I(x^2), se = F)
2a) Winning times decrease over time; in other words, athletes have improved over the years.
autoplot(mens400)
2b) According to the regression summary below, the winning time decreases by an average of 0.25830 seconds per year.
mens <- data.frame(year = seq(1896, 2016, by = 4) # the Games are held every four years
                   , winning_time = mens400)
mens %>%
ggplot(aes(x = year, y = winning_time)) +
geom_point() +
geom_smooth(method = "lm", se = F)
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
model2 <- lm(winning_time ~ year, data = mens)
summary(model2)
##
## Call:
## lm(formula = winning_time ~ year, data = mens)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6002 -0.5747 -0.2858 0.5751 4.1505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 563.02409 46.95546 11.99 4.27e-12 ***
## year -0.25830 0.02346 -11.01 2.75e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.136 on 26 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.8234, Adjusted R-squared: 0.8166
## F-statistic: 121.2 on 1 and 26 DF, p-value: 2.752e-11
2c) The residual plot shows decreasing variance over time and a non-normal distribution, but the residuals have a mean of roughly zero. Therefore, point forecasts from this model might be acceptable, but the prediction intervals, which assume normally distributed residuals, might not be reliable.
checkresiduals(model2)
##
## Breusch-Godfrey test for serial correlation of order up to 6
##
## data: Residuals
## LM test = 3.6082, df = 6, p-value = 0.7295
2d) The prediction intervals are based on the assumption of normally distributed residuals.
pred <- forecast(model2, newdata = data.frame(year = 2020))
pred
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1 41.26742 39.64486 42.88999 38.73108 43.80377
plot(pred)
3) The easter() function returns a dummy variable that equals 1 in the quarter containing Easter, with fractional values splitting the holiday proportionally between March and April (e.g. 0.67/0.33 in 1956).
easter(ausbeer)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00
4) Taking logs to base 10, the model is \[ \log y = \beta_0 + \beta_1 \log x + \varepsilon \] Solving for y gives \[ y = 10^{\beta_0 + \varepsilon} x^{\beta_1} \] Differentiating with respect to x, \[ \frac{\partial y}{\partial x} = 10^{\beta_0 + \varepsilon} \beta_1 x^{\beta_1 - 1} \] so the elasticity is \[ \frac{\partial y}{\partial x} \frac{x}{y} = \frac{10^{\beta_0 + \varepsilon} \beta_1 x^{\beta_1}}{y} = \frac{10^{\beta_0 + \varepsilon} \beta_1 x^{\beta_1}}{10^{\beta_0 + \varepsilon} x^{\beta_1}} = \beta_1 \] That is, a 1% increase in x is associated with an approximate \(\beta_1\)% change in y.
5a) Sales in 1991 seem unusual: volume declines in that year, which is abnormal compared with the steadily expanding volume in the other years.
autoplot(fancy)
5b) After the log transformation, the seasonality in the early years becomes clear; the original values in those years were too small relative to the values a few years later for the pattern to be visible. The log also stabilizes the seasonal variation, which grows with the level of the series, and with this simpler, more stable pattern forecasts can be more accurate.
autoplot(log(fancy))
seasonplot(fancy)
seasonplot(log(fancy))
5c)
fancy_data <- data.frame(sale = fancy
                         , year = rep(1987:1993, each = 12)
                         , month = rep(1:12, 7))
# surfing-festival dummy: every March from 1988 onwards
festival <- ifelse(fancy_data$year > 1987 & fancy_data$month == 3, 1, 0)
model3 <- tslm(log(fancy) ~ trend + season + festival)
summary(model3)
##
## Call:
## tslm(formula = log(fancy) ~ trend + season + festival)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33673 -0.12757 0.00257 0.10911 0.37671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend 0.0220198 0.0008268 26.634 < 2e-16 ***
## season2 0.2514168 0.0956790 2.628 0.010555 *
## season3 0.2660828 0.1934044 1.376 0.173275
## season4 0.3840535 0.0957075 4.013 0.000148 ***
## season5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season12 1.9622412 0.0961066 20.417 < 2e-16 ***
## festival 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.2e-16
5d) From the scatter plot, we can see that the fitted values are positively correlated with the actual values, and the residual distribution is approximately normal apart from some outliers on the right side. There is also no clear pattern between the fitted values and the residuals, so the fitted values predict well and the prediction intervals should be fairly reliable. However, since there is an obvious pattern in the residual time plot, which is confirmed by the ACF plot, we can conclude that some useful predictors are missing from the model, and thus the model can be further improved.
checkresiduals(model3)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494
df <- cbind(fit = as.vector(model3$fitted.values)
            , act = log(fancy_data$sale) # compare on the log scale used by the model
            , res = as.vector(model3$residuals))
df %>%
as.data.frame() %>%
ggplot(aes(x=fit, y=act))+
geom_point() +
xlab("Fitted.value") +
ylab("Actual.value") +
geom_smooth(method = "lm", se = F)
## `geom_smooth()` using formula 'y ~ x'
df %>%
as.data.frame() %>%
ggplot(aes(x=fit, y=res))+
geom_point() +
xlab("Fitted.value") +
ylab("Residuals")
5e) From the boxplot, we can see that the residuals for most months are centred around 0. However, the variance is not constant across months: it tends to be small in the middle of the year and large at the start and end of the year.
boxplot(residuals(model3) ~ cycle(residuals(model3)))
5f) From the coefficients below, the positive trend coefficient shows a clear upward trend over time, and the large, significant festival dummy shows that the festival plays an important role in driving sales.
coef(model3)
## (Intercept) trend season2 season3 season4 season5
## 7.61966701 0.02201983 0.25141682 0.26608280 0.38405351 0.40948697
## season6 season7 season8 season9 season10 season11
## 0.44882828 0.61045453 0.58796443 0.66932985 0.74739195 1.20674790
## season12 festival
## 1.96224123 0.50151509
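Because the model is fitted on the natural-log scale, the festival coefficient translates into a multiplicative effect on sales:
# exp(0.5015) is about 1.65, i.e. roughly 65% higher sales in festival months
exp(coef(model3)[["festival"]])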
5g) The p-value of 0.002494 in the Breusch-Godfrey test indicates that the residuals are distinguishable from a white-noise series; they are autocorrelated.
checkresiduals(model3)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494
5h)
df <- data.frame(year = rep(1994:1996, each = 12)
, month = rep(1:12, 3))
df$festival <- ifelse(df$month == 3, 1, 0)
pred.fancy <- forecast(model3, newdata = df)
pred.fancy
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1994 9.491352 9.238522 9.744183 9.101594 9.88111
## Feb 1994 9.764789 9.511959 10.017620 9.375031 10.15455
## Mar 1994 10.302990 10.048860 10.557120 9.911228 10.69475
## Apr 1994 9.941465 9.688635 10.194296 9.551707 10.33122
## May 1994 9.988919 9.736088 10.241749 9.599161 10.37868
## Jun 1994 10.050280 9.797449 10.303110 9.660522 10.44004
## Jul 1994 10.233926 9.981095 10.486756 9.844168 10.62368
## Aug 1994 10.233456 9.980625 10.486286 9.843698 10.62321
## Sep 1994 10.336841 10.084010 10.589671 9.947083 10.72660
## Oct 1994 10.436923 10.184092 10.689753 10.047165 10.82668
## Nov 1994 10.918299 10.665468 11.171129 10.528541 11.30806
## Dec 1994 11.695812 11.442981 11.948642 11.306054 12.08557
## Jan 1995 9.755590 9.499844 10.011336 9.361338 10.14984
## Feb 1995 10.029027 9.773281 10.284773 9.634775 10.42328
## Mar 1995 10.567228 10.310518 10.823938 10.171489 10.96297
## Apr 1995 10.205703 9.949957 10.461449 9.811451 10.59996
## May 1995 10.253157 9.997411 10.508903 9.858904 10.64741
## Jun 1995 10.314518 10.058772 10.570264 9.920265 10.70877
## Jul 1995 10.498164 10.242418 10.753910 10.103911 10.89242
## Aug 1995 10.497694 10.241948 10.753440 10.103441 10.89195
## Sep 1995 10.601079 10.345333 10.856825 10.206826 10.99533
## Oct 1995 10.701161 10.445415 10.956907 10.306908 11.09541
## Nov 1995 11.182537 10.926791 11.438282 10.788284 11.57679
## Dec 1995 11.960050 11.704304 12.215796 11.565797 12.35430
## Jan 1996 10.019828 9.760564 10.279093 9.620151 10.41951
## Feb 1996 10.293265 10.034000 10.552530 9.893588 10.69294
## Mar 1996 10.831466 10.571566 11.091365 10.430810 11.23212
## Apr 1996 10.469941 10.210677 10.729206 10.070264 10.86962
## May 1996 10.517395 10.258130 10.776659 10.117718 10.91707
## Jun 1996 10.578756 10.319491 10.838021 10.179079 10.97843
## Jul 1996 10.762402 10.503137 11.021667 10.362725 11.16208
## Aug 1996 10.761932 10.502667 11.021196 10.362254 11.16161
## Sep 1996 10.865317 10.606052 11.124582 10.465640 11.26499
## Oct 1996 10.965399 10.706134 11.224664 10.565722 11.36508
## Nov 1996 11.446774 11.187510 11.706039 11.047097 11.84645
## Dec 1996 12.224288 11.965023 12.483552 11.824611 12.62396
autoplot(pred.fancy)
5i)
exp(as.data.frame(pred.fancy))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 1994 13244.70 10285.82 17054.73 8969.583 19557.43
## Feb 1994 17409.81 13520.45 22418.00 11790.284 25707.73
## Mar 1994 29821.65 23129.40 38450.24 20155.412 44123.68
## Apr 1994 20774.16 16133.21 26750.16 14068.696 30675.62
## May 1994 21783.73 16917.24 28050.15 14752.395 32166.37
## Jun 1994 23162.27 17987.81 29825.24 15685.969 34201.95
## Jul 1994 27831.56 21613.98 35837.72 18848.111 41096.73
## Aug 1994 27818.48 21603.82 35820.87 18839.249 41077.41
## Sep 1994 30848.42 23956.87 39722.43 20891.193 45551.50
## Oct 1994 34095.57 26478.61 43903.67 23090.230 50346.32
## Nov 1994 55176.84 42850.31 71049.28 37366.903 81475.41
## Dec 1994 120067.79 93244.59 154607.08 81312.400 177294.90
## Jan 1995 17250.40 13357.65 22277.59 11629.938 25587.08
## Feb 1995 22675.20 17558.28 29283.31 15287.252 33633.55
## Mar 1995 38840.85 30046.98 50208.44 26146.972 57697.39
## Apr 1995 27057.06 20951.33 34942.16 18241.435 40133.06
## May 1995 28371.96 21969.51 36640.25 19127.918 42083.42
## Jun 1995 30167.42 23359.80 38958.95 20338.387 44746.58
## Jul 1995 36248.88 28068.91 46812.70 24438.412 53767.06
## Aug 1995 36231.84 28055.72 46790.69 24426.922 53741.78
## Sep 1995 40178.16 31111.50 51887.06 27087.467 59595.26
## Oct 1995 44407.37 34386.35 57348.77 29938.733 65868.34
## Nov 1995 71864.42 55647.40 92807.48 48449.831 106594.69
## Dec 1995 156380.86 121091.75 201954.08 105429.448 231955.81
## Jan 1996 22467.57 17336.40 29117.46 15065.329 33506.86
## Feb 1996 29533.04 22788.25 38274.14 19802.984 44043.89
## Mar 1996 50587.81 39009.73 65602.25 33887.802 75517.62
## Apr 1996 35240.15 27191.96 45670.42 23629.808 52555.15
## May 1996 36952.72 28513.41 47889.88 24778.151 55109.18
## Jun 1996 39291.20 30317.82 50920.48 26346.183 58596.65
## Jul 1996 47211.93 36429.60 61185.57 31657.322 70409.18
## Aug 1996 47189.73 36412.48 61156.80 31642.439 70376.07
## Sep 1996 52329.57 40378.47 67817.91 35088.887 78041.33
## Oct 1996 57837.85 44628.77 74956.52 38782.394 86256.08
## Nov 1996 93598.96 72222.70 121302.09 62761.521 139588.15
## Dec 1996 203676.38 157160.50 263959.89 136572.460 303751.35
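Note that exponentiating returns the median, not the mean, on the original scale. A bias-adjusted back-transformation is sketched below, using InvBoxCox() with lambda = 0 (the log case) and the residual variance of model3 as a rough stand-in for the forecast variance:
# hedged sketch: mean (bias-adjusted) rather than median point forecasts
fvar <- var(residuals(model3))
InvBoxCox(pred.fancy$mean, lambda = 0, biasadj = TRUE, fvar = fvar)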
5j) To get a better result, a Box-Cox transformation with other values of lambda could be tried, to see whether it gives a better fit than the log transformation (lambda = 0).
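A minimal sketch, letting BoxCox.lambda() choose lambda automatically and passing it to tslm(), which accepts a lambda argument:
# Box-Cox alternative to the log transformation
lambda <- BoxCox.lambda(fancy) # lambda selected by the Guerrero method
model3.bc <- tslm(fancy ~ trend + season + festival, lambda = lambda)
checkresiduals(model3.bc) # compare the diagnostics with those of model3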
6a)
gas.ts <- window(gasoline, end = 2005)
autoplot(gas.ts
, xlab = "date"
, ylab = "gasoline product" )
for(num in c(5, 10, 15)){
mod <- tslm(gas.ts ~ trend + fourier(gas.ts, K = num))
print(
autoplot(gas.ts) +
autolayer(mod$fitted.values,
series = as.character(num)) +
ylab("gasoline") +
guides(colour = guide_legend(title = "Number of Fourier Transform pairs"))
)
}
6b)
K = 12 Fourier pairs would be the most appropriate choice, as it yields both the lowest CV value and the lowest AICc value.
gas.df <- data.frame(index = 1:26
                     , cv = rep(NA, 26)
                     , aicc = rep(NA, 26))
# try every K up to half the weekly frequency (about 26 pairs)
for(num in 1:round(frequency(gasoline)/2)){
  mod <- tslm(gas.ts ~ trend + fourier(gas.ts, K = num))
  gas.df$cv[num] <- CV(mod)[["CV"]]
  gas.df$aicc[num] <- CV(mod)[["AICc"]]
}
which.min(gas.df$cv)
## [1] 12
which.min(gas.df$aicc)
## [1] 12
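Plotting both criteria against K makes the selection visible; both curves bottom out at K = 12:
# CV and AICc as a function of the number of Fourier pairs
plot(gas.df$index, gas.df$cv, type = "b", xlab = "K", ylab = "CV")
plot(gas.df$index, gas.df$aicc, type = "b", xlab = "K", ylab = "AICc")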
har.12 <- tslm(gas.ts ~ trend + fourier(gas.ts, K = 12))
autoplot(gas.ts) +
autolayer(har.12$fitted.values
, series = as.character(12))+
ylab("gasoline") +
guides(colour = guide_legend(title = "Number of Fourier Transform pairs"))
6c) The Breusch-Godfrey test below (p-value = 0.001021) indicates that the residuals are autocorrelated rather than white noise, so the point forecasts may still be usable but the prediction intervals may be unreliable.
checkresiduals(har.12)
##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 154.2, df = 104, p-value = 0.001021
6d)
fc <- forecast(har.12, newdata = data.frame(fourier(x = gas.ts, K = 12, h = 52)))
fc
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2005.014 8.713386 8.371087 9.055684 8.189474 9.237298
## 2005.033 8.642131 8.299720 8.984542 8.118047 9.166215
## 2005.052 8.656942 8.314540 8.999345 8.132871 9.181013
## 2005.071 8.661292 8.318899 9.003684 8.137236 9.185347
## 2005.090 8.686370 8.344109 9.028631 8.162515 9.210225
## 2005.110 8.784129 8.441982 9.126276 8.260448 9.307809
## 2005.129 8.895967 8.553823 9.238111 8.372291 9.419643
## 2005.148 8.937884 8.595756 9.280012 8.414232 9.461535
## 2005.167 8.940875 8.598761 9.282988 8.417246 9.464503
## 2005.186 8.988596 8.646477 9.330716 8.464958 9.512235
## 2005.205 9.062316 8.720191 9.404442 8.538669 9.585963
## 2005.225 9.073794 8.731673 9.415915 8.550153 9.597435
## 2005.244 9.031913 8.689800 9.374027 8.508284 9.555542
## 2005.263 9.041458 8.699342 9.383575 8.517824 9.565092
## 2005.282 9.122877 8.780756 9.464999 8.599236 9.646519
## 2005.301 9.169383 8.827263 9.511503 8.645744 9.693022
## 2005.320 9.132601 8.790487 9.474715 8.608970 9.656231
## 2005.340 9.120782 8.778667 9.462897 8.597150 9.644414
## 2005.359 9.221551 8.879431 9.563671 8.697912 9.745190
## 2005.378 9.335425 8.993306 9.677545 8.811787 9.859064
## 2005.397 9.316637 8.974522 9.658752 8.793006 9.840269
## 2005.416 9.214109 8.871994 9.556224 8.690478 9.737740
## 2005.435 9.218957 8.876838 9.561076 8.695320 9.742595
## 2005.455 9.383861 9.041741 9.725980 8.860222 9.907499
## 2005.474 9.545162 9.203046 9.887278 9.021530 10.068795
## 2005.493 9.559667 9.217553 9.901782 9.036037 10.083298
## 2005.512 9.487053 9.144935 9.829171 8.963416 10.010689
## 2005.531 9.464953 9.122834 9.807073 8.941315 9.988592
## 2005.550 9.508378 9.166261 9.850494 8.984744 10.032011
## 2005.570 9.534922 9.192808 9.877037 9.011292 10.058553
## 2005.589 9.521986 9.179869 9.864104 8.998351 10.045621
## 2005.608 9.522899 9.180779 9.865019 8.999260 10.046538
## 2005.627 9.548443 9.206326 9.890560 9.024808 10.072078
## 2005.646 9.536705 9.194591 9.878819 9.013075 10.060335
## 2005.665 9.451014 9.108897 9.793131 8.927380 9.974649
## 2005.685 9.330630 8.988509 9.672750 8.806990 9.854269
## 2005.704 9.229844 8.887726 9.571962 8.706208 9.753480
## 2005.723 9.170862 8.828748 9.512976 8.647232 9.694492
## 2005.742 9.168084 8.825968 9.510200 8.644451 9.691717
## 2005.761 9.230990 8.888869 9.573110 8.707349 9.754630
## 2005.780 9.320372 8.978252 9.662492 8.796734 9.844010
## 2005.800 9.363916 9.021801 9.706030 8.840285 9.887546
## 2005.819 9.342664 9.000548 9.684779 8.819032 9.866296
## 2005.838 9.304159 8.962036 9.646281 8.780516 9.827801
## 2005.857 9.267511 8.925389 9.609633 8.743869 9.791153
## 2005.876 9.194408 8.852292 9.536523 8.670776 9.718040
## 2005.895 9.100701 8.758587 9.442816 8.577070 9.624332
## 2005.915 9.104667 8.762540 9.446794 8.581017 9.628317
## 2005.934 9.265935 8.923806 9.608064 8.742282 9.789587
## 2005.953 9.436660 9.094538 9.778782 8.913018 9.960302
## 2005.972 9.400410 9.058293 9.742527 8.876776 9.924044
## 2005.991 9.147441 8.805233 9.489649 8.623668 9.671215
6e) From the plot, we can clearly see that the prediction intervals contain most of the actual values, but the model lacks the ability to capture the sudden drops.
autoplot(fc) +
autolayer(fc, series = "predict") +
autolayer(window(gasoline
, start = 2004
, end = 2006)
, series = "Actual") +
coord_cartesian(xlim = c(2004, 2006))
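Beyond the visual check, the forecast errors over 2005 can be quantified against the held-out observations; a small sketch:
# compare the 52 weekly forecasts with the actual 2005 gasoline data
accuracy(fc, window(gasoline, start = 2005, end = 2006))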
7a) The water level of Lake Huron has an approximately downward trend over the years, but its year-to-year variation seems to increase.
autoplot(huron)
7b)
fit.lin <- tslm(huron ~ trend)
t <- time(huron)
tb <- pmax(0, t - 1915) # knot at 1915 for the piecewise trend
fit.pw <- tslm(huron ~ t + tb)
autoplot(huron) +
autolayer(fitted(fit.lin), series = "Linear") +
autolayer(fitted(fit.pw), series = "Piecewise") +
xlab("Year") + ylab("Water Level") +
ggtitle("Lake Huron") +
guides(colour = guide_legend(title = " "))
7c) Because the slope coefficient of the linear model is negative, implying a continuing downward trend, its forecasts are lower than those of the piecewise linear model. I would say the piecewise model makes more sense, because we can see from the plot that the downward trend levels off after 1915.
h = 8
fcasts.lin <- forecast(fit.lin, h = h)
t.new <- t[length(t)] + seq(h)
tb.new <- tb[length(tb)] + seq(h)
newdata <- cbind(t=t.new, tb=tb.new) %>%
as.data.frame()
fcasts.pw <- forecast(fit.pw, newdata = newdata)
fcasts.lin
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1973 7.806127 6.317648 9.294605 5.516501 10.095752
## 1974 7.781926 6.292536 9.271315 5.490899 10.072952
## 1975 7.757724 6.267406 9.248043 5.465269 10.050179
## 1976 7.733523 6.242259 9.224788 5.439613 10.027434
## 1977 7.709322 6.217094 9.201550 5.413929 10.004715
## 1978 7.685121 6.191912 9.178331 5.388219 9.982024
## 1979 7.660920 6.166712 9.155128 5.362481 9.959359
## 1980 7.636719 6.141494 9.131943 5.336717 9.936721
fcasts.pw
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1973 8.455119 7.063583 9.846655 6.314483 10.59575
## 1974 8.454992 7.061518 9.848467 6.311374 10.59861
## 1975 8.454866 7.059398 9.850333 6.308182 10.60155
## 1976 8.454739 7.057225 9.852253 6.304906 10.60457
## 1977 8.454612 7.054998 9.854227 6.301549 10.60768
## 1978 8.454486 7.052717 9.856254 6.298109 10.61086
## 1979 8.454359 7.050384 9.858334 6.294587 10.61413
## 1980 8.454232 7.047997 9.860467 6.290984 10.61748
1) A 3×5 MA is three successive 5-term moving averages averaged together: \[ \hat{T}_t = \frac{1}{3}\Big[\frac{1}{5}(y_{t-3}+y_{t-2}+y_{t-1}+y_{t}+y_{t+1}) + \frac{1}{5}(y_{t-2}+y_{t-1}+y_{t}+y_{t+1}+y_{t+2}) + \frac{1}{5}(y_{t-1}+y_{t}+y_{t+1}+y_{t+2}+y_{t+3})\Big] \] Collecting terms, \[ \hat{T}_t = \frac{1}{3}\left(\frac{1}{5}y_{t-3}+\frac{2}{5}y_{t-2}+\frac{3}{5}y_{t-1}+\frac{3}{5}y_{t}+\frac{3}{5}y_{t+1}+\frac{2}{5}y_{t+2}+\frac{1}{5}y_{t+3}\right) \] \[ \hat{T}_t = \frac{1}{15}y_{t-3}+\frac{2}{15}y_{t-2}+\frac{1}{5}y_{t-1}+\frac{1}{5}y_{t}+\frac{1}{5}y_{t+1}+\frac{2}{15}y_{t+2}+\frac{1}{15}y_{t+3} \approx 0.067\,y_{t-3}+0.133\,y_{t-2}+0.200\,y_{t-1}+0.200\,y_{t}+0.200\,y_{t+1}+0.133\,y_{t+2}+0.067\,y_{t+3} \]
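A quick numerical check of these weights, applying both versions to the plastics series used in the next exercise (a sketch):
# 3x5 MA via two passes of ma(), versus the 7-term weighted MA above
w <- c(1, 2, 3, 3, 3, 2, 1) / 15
ma3x5 <- ma(ma(plastics, order = 5), order = 3)
wma7 <- stats::filter(plastics, w, sides = 2)
max(abs(ma3x5 - wma7), na.rm = TRUE) # should be numerically zero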
2a) The plot shows clear seasonal fluctuations within each year together with an upward trend-cycle.
autoplot(plastics)
2b)
m <- 12 # seasonal period
t_hat <- ma(plastics, order = m, centre = T) # 2x12 MA estimate of the trend-cycle
dtrend <- plastics / t_hat # detrended series
autoplot(t_hat, main = "Trend")
# seasonal indices: average the detrended values month by month
s_hat <- ts(rep(tapply(dtrend, cycle(dtrend), mean, na.rm = T), 5), frequency = 12)
autoplot(s_hat, main = "Seasonality")
r_hat <- plastics / (t_hat * s_hat) # remainder component
plot(x = 1:60, y = r_hat - 1, type = "h", main = "Remainder"); abline(h = 0)
plastics %>% decompose(type="multiplicative") %>%
autoplot() + xlab("Year") +
ggtitle("Classical multiplicative decomposition")
2c) Yes, the results shown above support the findings in 2a.
2d)
decomp <- plastics %>% decompose(type="multiplicative")
autoplot(plastics, series = "Original data") +
autolayer(seasadj(decomp), series = "Seasonally Adjusted data")
2e) The seasonally adjusted series seems to be influenced considerably by the added outlier. After the unusual spike, the adjusted series fails to follow the pattern, even though the subsequent pattern is virtually identical to the others.
set.seed(123)
i <- sample(60, 1) # choose one observation at random and add 500 to it
new.data <- ts(replace(plastics, i, plastics[i] + 500)
               , frequency = 12)
autoplot(new.data)
decomp.new <- new.data %>% decompose(type="multiplicative")
autoplot(new.data, series = "Original data") +
autolayer(seasadj(decomp.new), series = "Seasonally Adjusted data")
2f) The outlier in the middle of the series seems to have a smaller influence than the outlier near the end, but in both plots the outlier mainly distorts the observations close to it; the influence fades for observations further away in time.
new1 <- new2 <- plastics
new1[15] <- new1[15] + 500
new2[50] <- new2[50] + 500
decomp.new1 <- new1 %>% decompose(type="multiplicative")
autoplot(new1, series = "Original data") +
autolayer(seasadj(decomp.new1), series = "Seasonally Adjusted data")
decomp.new2 <- new2 %>% decompose(type="multiplicative")
autoplot(new2, series = "Original data") +
autolayer(seasadj(decomp.new2), series = "Seasonally Adjusted data")
3)
retaildata <- readxl::read_excel("retail.xlsx", skip=1)
myts <- ts(retaildata[,"A3349873A"],
frequency=12, start=c(1982,4))
autoplot(myts)
ggseasonplot(myts)
ggsubseriesplot(myts)
gglagplot(myts)
ggAcf(myts)
fit <- seas(myts, x11 = "")
autoplot(fit) +
ggtitle("X11 decomposition")
4a) From the seasonal component in Figure 6.16, we can see that the seasonal patterns change very slowly over time, so the patterns are virtually identical from one year to the next; however, years far apart show different patterns. For example, the number of persons in the civilian labour force in Australia tends to have two peaks per year after 1983 and returns to a single peak after 1987. Furthermore, the overall trend is upward, even though it was close to flat during 1991 and 1992; the trend-cycle fails to capture the recession of 1991/1992, which results in the outliers in the remainder plot. That is also why the scale of the remainder plot is larger than that of the seasonal plot.
4b) The recession of 1991/1992 can be detected in the seasonally adjusted plot.
labor.fit <- stl(labour, s.window = 13, robust = TRUE)
autoplot(labour, series="Original") +
autolayer(trendcycle(labor.fit), series="Trend") +
autolayer(seasadj(labor.fit), series="Seasonally Adjusted") +
xlab("Year") + ylab("Number of people") +
ggtitle("Number of people in the civilian labour force in Australia")
5a) The seasonal plot is generally concave up, but it shows more fluctuation in the summer months in recent years. This dramatic change in seasonality could be caused by the widespread adoption of air conditioning in summer and heating in winter.
autoplot(cangas)
ggsubseriesplot(cangas)
ggseasonplot(cangas)
5b) Looking roughly at the seasonal plot above, the yearly seasonal patterns can be clustered into three groups: very flat patterns at the bottom from about 1960 to 1975; clearly concave-up patterns in the middle from roughly 1976 to 1990; and patterns from 1991 to 2005 that fluctuate in the middle of each year. Since the seasonality clearly changes over time, I set s.window to 15.
stl.cangas <- stl(cangas, s.window = 15)
autoplot(stl.cangas) +
ggtitle("Monthly Canadian Gas Production",
subtitle = "STL decomposition")
autoplot(cangas, series = "Original") +
autolayer(seasadj(stl.cangas), series = "Seasonally Adjusted") +
ggtitle("Monthly Canadian Gas Production",
subtitle = "Seasonally Adjusted Plot")
5c) Compared with the STL decomposition, the X11 and SEATS methods produce quite different seasonal components and a remainder component on a relatively larger scale, indicating that STL is the more robust method in this case.
cangas.x11 <- seas(cangas, x11 = "")
cangas.seats <- seas(cangas)
autoplot(cangas.x11) +
ggtitle("Monthly Canadian Gas Production",
subtitle = "X11 decomposition")
autoplot(cangas, series = "Original") +
autolayer(seasadj(cangas.x11), series = "Seasonally Adjusted") +
ggtitle("X11 decomposition")
autoplot(cangas.seats) +
ggtitle("Monthly Canadian Gas Production",
subtitle = "SEATS decomposition")
autoplot(cangas, series = "Original") +
autolayer(seasadj(cangas.seats), series = "Seasonally Adjusted") +
ggtitle("SEATS decomposition")
6a)
bri.stl.fix <- stl(bricksq, s.window = "periodic")
bri.stl.dif <- stl(bricksq, s.window = 9)
6b)
autoplot(bri.stl.fix) +
ggtitle("Quarterly clay brick production in Australia",
subtitle = "Fixed seasonality STL decomposition")
autoplot(bricksq, series = "Original") +
autolayer(seasadj(bri.stl.fix), series = "Seasonally Adjusted") +
ggtitle("Fixed seasonality STL decomposition")
autoplot(bri.stl.dif) +
ggtitle("Quarterly clay brick production in Australia",
subtitle = "Changing seasonality STL decomposition")
autoplot(bricksq, series = "Original") +
autolayer(seasadj(bri.stl.dif), series = "Seasonally Adjusted") +
ggtitle("Changing seasonality STL decomposition")
6c) We can see that the prediction interval of the STL with changing seasonality is narrower than that of the fixed-seasonality STL, because the scale of the remainder component of the former is smaller than that of the latter.
bri.stl.fix %>% seasadj() %>% naive() %>% autoplot() +
ggtitle(label = "Naive forecast of seasonal-fixed adjusted brick data")
bri.stl.dif %>% seasadj() %>% naive() %>% autoplot() +
ggtitle(label = "Naive forecast of seasonal-changing adjusted brick data")
6d)
stlf(bricksq) %>%
autoplot() +
ggtitle(label = "STL forecast")
6e) The Ljung-Box test indicates that we can confidently rule out the residuals being white noise, so the residuals are autocorrelated.
stlf(bricksq) %>%
checkresiduals()
## Warning in checkresiduals(.): The fitted degrees of freedom is based on the
## model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 44.704, df = 6, p-value = 5.358e-08
##
## Model df: 2. Total lags used: 8
6f) No, there is not much difference, although the robust fit does give a somewhat smaller Ljung-Box statistic (27.278 versus 44.704).
bri.robust <- stlf(bricksq, robust = T)
autoplot(bri.robust) +
ggtitle(label = "STL forecast")
checkresiduals(bri.robust)
## Warning in checkresiduals(bri.robust): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 27.278, df = 6, p-value = 0.0001284
##
## Model df: 2. Total lags used: 8
6g) The forecasts from the robust STL fit seem to capture the original data better.
bri.train <- window(bricksq, end = c(1992,4))
bri.test <- window(bricksq, start = c(1993,1))
bri.stlf <- stlf(bri.train, robust = T)
bri.naive <- snaive(bri.train)
autoplot(bricksq, series = "Original") +
autolayer(bri.stlf, PI = FALSE, size = 1,
series = "stlf") +
autolayer(bri.naive, PI = FALSE, size = 1,
series = "snaive") +
scale_x_continuous(limits = c(1990, 1994.5)) +
scale_y_continuous(limits = c(350, 550))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
7)
autoplot(writing)
writing.rwdrift <- stlf(writing,
s.window = 13,
robust = TRUE,
lambda = BoxCox.lambda(writing),
method = "rwdrift")
autoplot(writing.rwdrift)
writing.naive <- stlf(writing,
s.window = 13,
robust = TRUE,
lambda = BoxCox.lambda(writing),
method = "naive")
autoplot(writing.naive)
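To decide between the rwdrift and naive variants, one option is to compare their residual diagnostics; a sketch:
# residual checks for the two stlf() variants of the writing series
checkresiduals(writing.rwdrift)
checkresiduals(writing.naive)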
8)
autoplot(fancy)
fancy.rwdrift <- stlf(fancy,
s.window = 13,
robust = TRUE,
lambda = BoxCox.lambda(fancy),
method = "rwdrift")
autoplot(fancy.rwdrift)
fancy.naive <- stlf(fancy,
s.window = 13,
robust = TRUE,
lambda = BoxCox.lambda(fancy),
method = "naive")
autoplot(fancy.naive)