library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.1
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## Warning: package 'tsibble' was built under R version 4.4.1
## Warning: package 'feasts' was built under R version 4.4.1
## Warning: package 'fabletools' was built under R version 4.4.1
## Warning: package 'fable' was built under R version 4.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
library(fabletools)
5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget). Australian takeaway food turnover (aus_retail).
#Australian Population from global_economy
Aus_pop<-global_economy|>
filter(Country=="Australia")
autoplot(Aus_pop, Population)+
labs(title = "Global economy population of Australia", y= "Population")# no seasonal but it has an upward trend, best to use RW(y ~ drift()).
Aus_popfit<-Aus_pop|>
model(
RW(Population~drift()) #random walk on a graph
)|>
forecast(h="5 years")
Aus_popfit|>
autoplot(Aus_pop)+
labs(title = "Global economy population of Australia",
y="Population")
#Brick in aus production, which has seasonality. SNAIVE(y) is most appropriate.
aus_brick<-aus_production|>
filter(!is.na(Bricks))|> #remove rows with no bricks data
model(
SNAIVE(Bricks)
)|>
forecast(h="5 years")
aus_brick|>
autoplot(aus_production)+
labs(title = "Brick production in Australia",
y="Millions of bricks")
#NSW Lamb from aus_livestock
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
NSW_lamb<-aus_livestock|>
filter(Animal=="Lambs", State=="New South Wales")
NSW_lamb|>
autoplot(Count)+
labs(title = "Count of Slaughter Lamb in New South Wales", y="Count of slaughter")
#Not a clear seasonality is viewed, and there is no trend, therefore NAIVE(y) would be best.
NSW_lambfit<-NSW_lamb|>
model(NAIVE(Count))|>
forecast(h="10 years")
NSW_lambfit|>
autoplot(NSW_lamb)+
labs(title = "Count of Slaughter Lamb in New South Wales", y="Count of slaughter")
#Household wealth (hh_budget)
Householdw<-hh_budget|>
filter(!is.na(Wealth))|>
select(c(Year,Wealth))
Householdw|>
autoplot(Wealth)+
labs(title = "Household Wealth",y= "Wealth")
#The plot shows an upwards trend no true seasonality, therefore Drift would be best.
Householdwfit<-Householdw|>
model(RW(Wealth~drift()))|>
forecast(h="5 years")
Householdwfit|>
autoplot(Householdw)+
labs(title = "Household Wealth",y= "Wealth")
#Australian takeaway food turnover (aus_retail)
Aus_takeaway<-aus_retail|>
filter(State=="Australian Capital Territory", Industry=="Takeaway food services")|>
select(c(Month, Turnover))
Aus_takeaway|>
autoplot()+
labs(title = "Austrialian Food Takeaway", y="Turnover")
#The plot shows an upwards trend, no true seasonality I believe drift would be best
Aus_takeawayfit<-Aus_takeaway|>
model(RW(Turnover~drift()))|>
forecast(h="2 years")
Aus_takeawayfit|>
autoplot(Aus_takeaway)+
labs(title = "Austrialian Food Takeaway", y="Turnover")
5.2 Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
fb_stock<-gafa_stock|>
filter(Symbol=="FB")|>
mutate(trading_day=row_number())|>
update_tsibble(index=trading_day, regular=TRUE)
fb_stock|>
autoplot(Close)+
labs(title = "Facebook Closing Stock [Time series]")
fb_stock|>
model(RW(Close~drift()))|>
forecast(h=42)|>
autoplot(fb_stock)+
labs(title="Facebook Closing Stock [Forecast]")
fb_stock|>
model(RW(Close~drift()))|>
forecast(h=42)|>
autoplot(fb_stock)+
labs(title="Facebook Closing Stock [Forecast]")+
geom_segment(aes(x = trading_day[1], y = Close[1], xend = tail(fb_stock$trading_day, 1), yend = tail(fb_stock$Close, 1)))
## Warning: Use of `fb_stock$trading_day` is discouraged.
## ℹ Use `trading_day` instead.
## Warning: Use of `fb_stock$Close` is discouraged.
## ℹ Use `Close` instead.
## Warning in geom_segment(aes(x = trading_day[1], y = Close[1], xend = tail(fb_stock$trading_day, : All aesthetics have length 1, but the data has 1258 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
fb_stock|>
model(Drift=RW(Close~drift()),
Mean=MEAN(Close),
Naive=NAIVE(Close))|>
forecast(h=20)|>
autoplot(fb_stock, level=NULL)+
labs(title="Facebook Closing Stock [Forecast2]")+
guides(color=guide_legend(title = "Forecast"))
The facebook stock plot didn’t show seasonality but it showed an upwards
trend. The best model would be drift as it showed a more closely related
forecast.
5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)
fit|>
augment() |>
features(.innov, box_pierce, lag=10)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 34.4 0.000160
fit|>
augment() |>
features(.innov, ljung_box, lag=10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 37.8 0.0000412
What do you conclude? The time plot shows there residual variance is not constant and the residual histogram has a mean other than zero and the not normally distributed. data doesn’t seem to have much white noise therefore season naive would be appropriate to use. I used used box pierce and ljung box to compare the p values and the results are no significant, therefore the residuals are not distinguishable from a white noise series.
5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
Aus_export <- global_economy |>
filter(Country=="Australia")|>
select(Year,Exports)
Aus_exportfit <- Aus_export |> model(NAIVE(Exports))
Aus_exportfit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Aus_exportfit |> forecast() |> autoplot(Aus_export)
SNAIVE didn’t work, as there is no seasonality. Therefore the best model
to use would be NAIVE. The residuals plot seems to be constant and close
the mean is at zero, meaning there is no significant correlation in
residual series.
brick_production <- aus_production |>
filter(!is.na(Bricks))|>
select(c(Quarter,Bricks))
# Define and estimate a model
brick_fit <- brick_production |> model(SNAIVE(Bricks))
# Look at the residuals
brick_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
brick_fit |> forecast() |> autoplot(brick_production)
The data for brick from aus_production has seasonality, therefore the best model to use would be SNAIVE.The time plot of the residual shows that the residual is not constant. Based on the residual histogram seems to be skewed to the left, meaning the residual is not normal.
5.7 For your retail time series (from Exercise 7 in Section 2.10):
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
#Create a training dataset consisting of observations before 2011 using
myseries_train <- myseries |>
filter(year(Month) < 2011)
#Check that your data have been split appropriately by producing the following plot
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
#Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
#Check the residuals.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Do the residuals appear to be uncorrelated and normally distributed? Answered below.
#Produce forecasts for the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
#Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
Do the residuals appear to be uncorrelated and normally distributed? How sensitive are the accuracy measures to the amount of training data used? Based on the ACF plot the residual seems to be uncorrelated and the histogram shows the residual is not normally distributed as it is slighted skewed to the left. The the accuracy measures better for the trained data as the MASE was closer to zero compared to the test, but overall the model forecast model isn’t accurate because the MASE is greater 1.