library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
For cases 3 and 4 in Section 1.5, list the possible predictor variables that might be useful, assuming that the relevant data are available.
Case 3: A company wants help predicting the resale value of the cars it owns after leasing them out for about three years.
Possible predictors: the price of each car when bought, the eventual sale prices of cars already sold, the car's condition, the type of car (make, model, year, colour, …), repairs made to the car, miles on the odometer, the cost of the materials used to make the car, the car's popularity, …
Case 4: Forecasting weekly air passenger traffic on major domestic routes.
Possible predictors: the week of the year, the number of passengers in the previous week, holidays, special events, the number of full flights, the route, the seat class, the average number of passengers per flight, the number of flights on that route, …
The aus_arrivals data set comprises quarterly international arrivals to Australia from Japan, New Zealand, the UK, and the US.
str(aus_arrivals)
## tbl_ts [508 × 3] (S3: tbl_ts/tbl_df/tbl/data.frame)
## $ Quarter : qtr [1:508] 1981 Q1, 1981 Q2, 1981 Q3, 1981 Q4, 1982 Q1, 1982 Q2, 1982...
## $ Origin : chr [1:508] "Japan" "Japan" "Japan" "Japan" ...
## $ Arrivals: int [1:508] 14763 9321 10166 19509 17117 10617 11737 20961 20671 12235 ...
## - attr(*, "key")= tibble [4 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Origin: chr [1:4] "Japan" "NZ" "UK" "US"
## ..$ .rows : list<int> [1:4]
## .. ..$ : int [1:127] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ : int [1:127] 128 129 130 131 132 133 134 135 136 137 ...
## .. ..$ : int [1:127] 255 256 257 258 259 260 261 262 263 264 ...
## .. ..$ : int [1:127] 382 383 384 385 386 387 388 389 390 391 ...
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
## - attr(*, "index")= chr "Quarter"
## ..- attr(*, "ordered")= logi TRUE
## - attr(*, "index2")= chr "Quarter"
## - attr(*, "interval")= interval [1:1] 1Q
## ..@ .regular: logi TRUE
aus_arrivals %>%
autoplot(Arrivals)
The data look strongly seasonal. Arrivals from NZ have continued to
increase at a steady rate. Arrivals from Japan increased until the
mid-1990s and have declined steadily since. Arrivals from the US have
been increasing, but at a slower rate than the other countries.
Arrivals from the UK have also been increasing over time and are
heavily affected by the season.
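One way to check these trend claims, as a quick sketch using only functions loaded by fpp3, is to aggregate the arrivals to annual totals per origin:
# Annual arrivals per origin: yearly totals smooth out the
# seasonality and make the long-run trends easier to compare.
aus_arrivals %>%
  group_by(Origin) %>%
  index_by(Year = year(Quarter)) %>%
  summarise(Arrivals = sum(Arrivals)) %>%
  autoplot(Arrivals)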
aus_arrivals %>%
gg_season(Arrivals)
The seasonal plot supports the same observations. The year-to-year
shift in the Japan panel is more noticeable here, and the strong Q1
and Q4 demand from the UK also stands out.
aus_arrivals %>%
gg_subseries(Arrivals)
Consider the last five years of the Gas data from aus_production.
gas <- tail(aus_production, 5*4) |> select(Gas)
gas %>%
autoplot(Gas)
There is a clear seasonal pattern, with gas demand highest in Q2 and Q3, the colder months in Australia.
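A seasonal plot of the same five-year subset makes that pattern easy to confirm (a quick sketch):
# Each line is one year; a consistent Q2-Q3 peak shows up as
# overlapping humps across years.
gas %>%
  gg_season(Gas)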
gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
Do the results support the graphical interpretation from part a? Yes: the decomposition shows a strong, regular seasonal component, consistent with the seasonality seen in the plot.
Compute and plot the seasonally adjusted data.
gas_model = gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components()
gas_model %>%
as_tsibble() %>%
autoplot(Gas, colour = "grey") +
geom_line(aes(y = season_adjust), colour = "blue")
Next, introduce an outlier near the end of the series (setting the last observation to 300) and recompute the seasonally adjusted data.
gas$Gas[20] <- 300
gas_model = gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components()
gas_model %>%
as_tsibble() %>%
autoplot(Gas, colour = "grey") +
geom_line(aes(y = season_adjust), colour = "blue")
It dragged the seasonally adjusted line up at the point of the outlier.
Does it make a difference if the outlier is in the middle rather than at the end? Reset the series first, so that only a single outlier is present, then place it in the middle.
gas <- tail(aus_production, 5*4) |> select(Gas)
gas$Gas[10] <- 300
gas_model = gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components()
gas_model %>%
as_tsibble() %>%
autoplot(Gas, colour = "grey") +
geom_line(aes(y = season_adjust), colour = "blue")
When the outlier is in the middle of the series, its effect on the
seasonally adjusted line appears much less pronounced than when it was
at the end.
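To quantify that difference, here is a sketch comparing each outlier scenario against the clean series; the helper season_adj is my own, not part of fpp3:
# Hypothetical helper: seasonally adjusted values from a classical
# multiplicative decomposition of a Gas tsibble.
season_adj <- function(ts_data) {
  ts_data %>%
    model(classical_decomposition(Gas, type = "multiplicative")) %>%
    components() %>%
    pull(season_adjust)
}
gas_clean <- tail(aus_production, 5*4) |> select(Gas)
gas_end <- gas_clean
gas_end$Gas[20] <- 300  # outlier at the end, as above
gas_mid <- gas_clean
gas_mid$Gas[10] <- 300  # outlier in the middle, as above
# Mean absolute change in the seasonally adjusted series
mean(abs(season_adj(gas_end) - season_adj(gas_clean)))
mean(abs(season_adj(gas_mid) - season_adj(gas_clean)))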
For your retail time series (from Exercise 7 in Section 2.10):
set.seed(23)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Do the residuals appear to be uncorrelated and normally distributed?
The mean of the residuals does not appear to be close to zero, which
suggests the forecasts are biased, and the ACF plot shows significant
autocorrelation, so the residuals do not appear to be uncorrelated.
The histogram does suggest that they are roughly normally distributed.
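A Ljung-Box test on the innovation residuals gives a formal check of the autocorrelation; a quick sketch (lag = 24 is a common choice for monthly data):
# A small p-value indicates significant autocorrelation.
augment(fit) %>%
  features(.innov, ljung_box, lag = 24)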
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy()
fc |> accuracy(myseries)
The accuracy measures are quite sensitive to the amount of training data used.
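As a sketch of that sensitivity, the SNAIVE fit can be repeated with a few different training cutoffs (the cutoff years here are arbitrary choices, not from the exercise):
# Test-set accuracy for SNAIVE as the training window grows.
for (cutoff in c(2005, 2008, 2011)) {
  train <- myseries %>% filter(year(Month) < cutoff)
  fc_cut <- train %>%
    model(SNAIVE(Turnover)) %>%
    forecast(new_data = anti_join(myseries, train))
  print(fc_cut %>% accuracy(myseries) %>% select(.model, RMSE, MAPE))
}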
The annual population of Afghanistan is available in the global_economy data set.
ge_df = global_economy
afghan = ge_df %>%
filter(Country == "Afghanistan") %>%
select(-Code, -Country)
afghan %>%
autoplot(Population)
The Soviet-Afghan war (1979-1989) caused a noticeable dip in the population, which motivates the knots used in the piecewise model below.
afghan_models = afghan %>%
model(
linear = TSLM(Population ~ trend()),
piecewise = TSLM(Population ~ trend(knots = c(1980, 1989)))
)
afghan %>%
autoplot(Population) +
geom_line(data = fitted(afghan_models), aes(y = .fitted, colour = .model)) +
labs(x = "Year", title = "Afghan Population over Time")
fc_afghan_models = afghan_models %>%
forecast(h = 5)
afghan %>%
autoplot(Population) +
geom_line(data = fitted(afghan_models), aes(y = .fitted, colour = .model)) +
autolayer(fc_afghan_models, alpha = .5, level = 95) +
labs(x = "Year", title = "Afghan Population over Time")
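To back up the visual comparison with numbers, a quick sketch of the in-sample fit statistics for the two models:
# In-sample accuracy plus AICc/CV from glance(); the linear and
# piecewise fits can be compared directly on these.
accuracy(afghan_models)
glance(afghan_models) %>%
  select(.model, AICc, CV)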
Data set global_economy contains the annual Exports from many countries. Select one country to analyse.
germany = ge_df %>%
filter(Country == "Germany", !is.na(Exports)) %>%
select(-Code, -Country)
germany %>%
autoplot(Exports)
There is a steady climb in exports from 1970 to 1990, with a few
spikes. Then in the early 1990s there is a drop-off, likely reflecting
German reunification, followed by a rapid climb until another dip in
2008-2009, after which exports recover.
germany_fit <- germany |>
model(ETS(Exports ~ error("A") + trend("N") + season("N")))
fc_germany <- germany_fit |>
forecast(h = 5)
fc_germany %>%
autoplot(germany) +
geom_line(data = augment(germany_fit), aes(y = .fitted, colour = .model)) +
labs(x = "Year", title = "Germany's Exports") +
guides(colour = "none")
germany |>
stretch_tsibble(.init = 10) |>
model(
SES = ETS(Exports ~ error("A") + trend("N") + season("N")),
Holt = ETS(Exports ~ error("A") + trend("A") + season("N"))
) |>
forecast(h = 5) |>
accuracy(germany)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 5 observations are missing between 2018 and 2022
Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.

The Holt (A,A,N) model has a lower cross-validated RMSE than the SES model, so the extra trend parameter appears to be worth it for this upward-trending series.
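Because the trended model spends an extra parameter, an information criterion that penalizes parameters is a useful complement to the cross-validated RMSE; a sketch:
# AICc trades off fit against the number of parameters.
germany %>%
  model(
    SES = ETS(Exports ~ error("A") + trend("N") + season("N")),
    Holt = ETS(Exports ~ error("A") + trend("A") + season("N"))
  ) %>%
  glance() %>%
  select(.model, AICc)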
Compare the forecasts from both methods. Which do you think is best?
germany |>
model(
SES = ETS(Exports ~ error("A") + trend("N") + season("N")),
Holt = ETS(Exports ~ error("A") + trend("A") + season("N"))
) |>
forecast(h = 5) |>
autoplot(germany, level = NULL)
I think the Holt (A,A,N) model is better because it takes the trend
into account, whereas the SES forecasts are flat.
Calculate a 95% prediction interval for the first forecast from each model, using the RMSE values and assuming normal errors, and compare these with the intervals produced by R.
germany_ANN <- germany |>
  model(ETS(Exports ~ error("A") + trend("N") + season("N")))
germany_AAN <- germany |>
  model(ETS(Exports ~ error("A") + trend("A") + season("N")))
# R's intervals; the first row is the h = 1 forecast whose mean (47.2)
# is used in the manual calculation below.
germany_ANN %>%
  forecast(h = 5) %>%
  hilo(95) %>%
  mutate(lower = `95%`$lower, upper = `95%`$upper) %>%
  select(lower, .mean, upper)
ANN_RMSE <- accuracy(germany_ANN) |>
pull(RMSE)
# Manual ANN lower bound: first forecast mean (47.2) - 1.96 * RMSE
47.2 - 1.96*ANN_RMSE
## [1] 43.75166
# Manual ANN upper bound: 47.2 + 1.96 * RMSE
47.2 + 1.96*ANN_RMSE
## [1] 50.64834
germany_AAN %>%
  forecast(h = 5) %>%
  hilo(95) %>%
  mutate(lower = `95%`$lower, upper = `95%`$upper) %>%
  select(lower, .mean, upper)
AAN_RMSE <- accuracy(germany_AAN) |>
pull(RMSE)
# Manual AAN lower bound
47.2 - 1.96*AAN_RMSE
## [1] 44.01511
# Manual AAN upper bound
47.2 + 1.96*AAN_RMSE
## [1] 50.38489
Comparing the intervals, the manually computed bounds are quite close to those generated by R.
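The small differences arise because R's intervals are based on the model's estimated variance rather than the training RMSE; a sketch extracting that estimate for the ANN model:
# sqrt(sigma2) is the standard deviation behind R's one-step
# interval; compare it with the training RMSE used manually above.
glance(germany_ANN) %>% select(.model, sigma2)
sqrt(glance(germany_ANN)$sigma2)
ANN_RMSE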
Simulate and plot some data from simple ARIMA models. The exercise provides the following template for generating data from an AR(1) model with \(\phi_1 = 0.6\) and \(\sigma^2 = 1\):
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.6*y[i-1] + e[i]
sim <- tsibble(idx = seq_len(100), y = y, index = idx)
set.seed(23)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.6*y[i-1] + e[i]
sim1 <- tsibble(idx = seq_len(100), y = y, index = idx)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.1*y[i-1] + e[i]
sim2 <- tsibble(idx = seq_len(100), y = y, index = idx)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.9*y[i-1] + e[i]
sim3 <- tsibble(idx = seq_len(100), y = y, index = idx)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- -0.6*y[i-1] + e[i]
sim4 <- tsibble(idx = seq_len(100), y = y, index = idx)
sim1 %>%
autoplot(y)
sim2 %>%
autoplot(y)
sim3 %>%
autoplot(y)
sim4 %>%
autoplot(y)
As \(\phi_1\) gets smaller, the variation from one time point to the next increases; that is, the previous value of y has less effect on the current one, so the series looks more like white noise. As \(\phi_1\) approaches 1, the series becomes smoother and starts to wander like a random walk, while the negative \(\phi_1 = -0.6\) makes the series oscillate around its mean.
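A sketch that measures this point-to-point variation directly, reusing one error series so that only \(\phi_1\) changes:
# sd of the first differences shrinks as phi1 grows: successive
# points move together more when the AR coefficient is larger.
set.seed(23)
e <- rnorm(100)
for (phi in c(-0.6, 0.1, 0.6, 0.9)) {
  y <- numeric(100)
  for (i in 2:100) y[i] <- phi*y[i-1] + e[i]
  cat("phi1 =", phi, " sd(diff(y)) =", round(sd(diff(y)), 2), "\n")
}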
Now simulate MA(1) models with \(\theta_1 \in \{-0.6, 0.1, 0.6, 0.9\}\), reusing one error series so that only the coefficient changes between simulations.
set.seed(23)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- -0.6*e[i-1] + e[i]
sim_ma1 <- tsibble(idx = seq_len(100), y = y, index = idx)
for(i in 2:100)
y[i] <- 0.1*e[i-1] + e[i]
sim_ma2 <- tsibble(idx = seq_len(100), y = y, index = idx)
for(i in 2:100)
y[i] <- 0.6*e[i-1] + e[i]
sim_ma3 <- tsibble(idx = seq_len(100), y = y, index = idx)
for(i in 2:100)
y[i] <- 0.9*e[i-1] + e[i]
sim_ma4 <- tsibble(idx = seq_len(100), y = y, index = idx)
sim_ma1 %>%
autoplot(y)
sim_ma2 %>%
autoplot(y)
sim_ma3 %>%
autoplot(y)
sim_ma4 %>%
autoplot(y)
ma1_stats = data.frame(
  model = "MA(1) theta1 = -0.6",
  mean = mean(sim_ma1$y),
  sd = sd(sim_ma1$y)
)
ma2_stats = data.frame(
  model = "MA(1) theta1 = 0.1",
  mean = mean(sim_ma2$y),
  sd = sd(sim_ma2$y)
)
ma3_stats = data.frame(
  model = "MA(1) theta1 = 0.6",
  mean = mean(sim_ma3$y),
  sd = sd(sim_ma3$y)
)
ma4_stats = data.frame(
  model = "MA(1) theta1 = 0.9",
  mean = mean(sim_ma4$y),
  sd = sd(sim_ma4$y)
)
ma1_model_stats = rbind(ma1_stats, ma2_stats, ma3_stats, ma4_stats)
ma1_model_stats
Interestingly, in this sample the mean of y increased as \(\theta_1\) got larger, even though the theoretical mean of an MA(1) process is zero regardless of \(\theta_1\). The larger \(\theta_1\) is, the more the previous error term carries over into the current y, which also increases the standard deviation.
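The theoretical moments are a useful check here: an MA(1) has mean zero whatever \(\theta_1\) is, while its standard deviation is \(\sqrt{1+\theta_1^2}\,\sigma\). A sketch of the theoretical sds with \(\sigma = 1\), for comparison with the table above:
# The sample sds above should land near these theoretical values.
theta <- c(-0.6, 0.1, 0.6, 0.9)
data.frame(theta1 = theta, theoretical_sd = sqrt(1 + theta^2))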
Finally, simulate an ARMA(1,1) model with \(\phi_1 = 0.6\) and \(\theta_1 = 0.6\), and an AR(2) model with \(\phi_1 = -0.8\) and \(\phi_2 = 0.3\).
set.seed(23)
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.6*y[i-1] + 0.6*e[i-1] + e[i]
sim_arma11 <- tsibble(idx = seq_len(100), y = y, index = idx)
set.seed(23)
y <- numeric(100)
e <- rnorm(100)
for(i in 3:100)
y[i] <- -0.8*y[i-1] + 0.3*y[i-2] + e[i]
sim_ar2 <- tsibble(idx = seq_len(100), y = y, index = idx)
sim_arma11 %>%
autoplot(y)
sim_ar2 %>%
autoplot(y)
Rather than settling down, the AR(2) series oscillates with exponentially growing amplitude: \(\phi_1 = -0.8\) and \(\phi_2 = 0.3\) violate the stationarity conditions, so the process is non-stationary.
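That behaviour can be confirmed from the AR(2) characteristic polynomial \(1 - \phi_1 z - \phi_2 z^2\): stationarity requires all of its roots to lie outside the unit circle. A sketch using base R's polyroot():
# Roots of 1 + 0.8z - 0.3z^2 (phi1 = -0.8, phi2 = 0.3). One root has
# modulus < 1, so the process is non-stationary, and the negative
# real root is what produces the growing oscillation.
abs(polyroot(c(1, 0.8, -0.3)))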