Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your Rpubs link as well as your .pdf file showing your run code.
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.3.2
## ✔ lubridate 1.9.3 ✔ fable 0.3.4
## ✔ ggplot2 3.5.1 ✔ fabletools 0.4.2
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(latex2exp)
## Warning: package 'latex2exp' was built under R version 4.3.3
library(seasonal)
## Warning: package 'seasonal' was built under R version 4.3.3
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
library(seasonalview)
## Warning: package 'seasonalview' was built under R version 4.3.3
##
## Attaching package: 'seasonalview'
##
## The following object is masked from 'package:seasonal':
##
## view
##
## The following object is masked from 'package:tibble':
##
## view
#loaded relevant libraries
5.1) Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget). Australian takeaway food turnover (aus_retail).
#first visuallizing the data
head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
global_economy |>
filter(Country == "Australia") |> # selecting australia from global economy
autoplot(Population) +
labs(y = "Population", title = "Australian Population")
# data appears to be relatively linear. will apply a drift forecast
Aus_pop <- global_economy |>
filter(Country == "Australia") |> # selecting australia from global economy
select(c("Country", "Year","Population")) #selecting population data and time
Aus_pop_training <- Aus_pop |>
filter_index("1960"~"2016") #selecting a subset of the data to train with. selected most of the data minus one year so that the forecast and the original graph connect
aus_pop_drift <- Aus_pop_training |> model(RW(Population ~ drift())) #take the training data and train the drift model
aus_pop_forecast <- aus_pop_drift |>
forecast(h = "20 years") #make the forecast model go out to 20 years.
Aus_pop |> autoplot(Population) +
aus_pop_forecast |> autolayer(filter_index(Aus_pop, "2017"~ .)) #graph the forecast from 2017.
labs(y = "Population", title = "Australia population")
## $y
## [1] "Population"
##
## $title
## [1] "Australia population"
##
## attr(,"class")
## [1] "labels"
the forecast predicts the next 20 years. though the lower slope of 1960 until 2006 predominates the predicted slope because more data is at the lower slope. after 2006 the slope increases. showing another forecast just based on the most recent years shows the difference in these slopes
Aus_pop_training_short <- Aus_pop |>
filter_index("2006"~"2016") #selecting a subset of the data to train with only based on the most recent years.
aus_pop_drift_short <- Aus_pop_training_short |> model(RW(Population ~ drift())) #take the shorter training data and train the drift model
aus_pop_forecast_short <- aus_pop_drift_short |>
forecast(h = "20 years") #make the forecast model go out to 20 years.
Aus_pop |> autoplot(Population) +
aus_pop_forecast_short |> autolayer(filter_index(Aus_pop, "2017"~ .)) #graph the forecast from 2017.
labs(y = "Population", title = "Australia population")
## $y
## [1] "Population"
##
## $title
## [1] "Australia population"
##
## attr(,"class")
## [1] "labels"
the slope dramatically increases based on the slope of the more recent years training data.
#first visuallizing the data
head(aus_production)
## # A tsibble: 6 x 7 [1Q]
## Quarter Beer Tobacco Bricks Cement Electricity Gas
## <qtr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1956 Q1 284 5225 189 465 3923 5
## 2 1956 Q2 213 5178 204 532 4436 6
## 3 1956 Q3 227 5297 208 561 4806 7
## 4 1956 Q4 308 5681 197 570 4418 6
## 5 1957 Q1 262 5577 187 529 4339 5
## 6 1957 Q2 228 5651 214 604 4811 7
aus_production |>
select("Quarter", "Bricks") |> # selecting Bricks produced per quarter
autoplot(Bricks) +
labs(y = "Bricks", title = "Australian Bricks")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
There is a clear seasonality to the data focusing on just the drift will not provide a good prediction. a snaive should provide some decent predictions
bricks <- aus_production |> select("Quarter", "Bricks") |> filter(!is.na(Bricks)) #select the bricks data remove the last few years with na values.
bricks_train <- bricks |> filter_index(. ~"2001 Q1") #cutting off the last few years to see how the model may predict the last 5 years
bricks_model <- bricks_train |> model(SNAIVE(Bricks ~ lag("5 years"))) #forecast based on the past 5 years
bricks_predict <- bricks_model |>
forecast(h = "5 years") #make the forecast model go out to 5 years.
bricks |> autoplot(Bricks) +
bricks_predict |> autolayer(filter_index(bricks), alpha = 0.5) + #graph the forecast from 2000. alpha was added to make it more transparent
labs(y = "Bricks", title = "Australia Bricks")
The forecast follows the seasonality of the data quite well. There is a large error prediction associated with it due the the fluctuations in the data. overall it follows the trend of the last 5 years specified. Esentially it is a copy and paste of the last five years from the training data set.
##NSW Lambs (aus_livestock)
#first visuallizing the data
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
unique(aus_livestock$State) #assuming "New South Wales" is NSW
## [1] Australian Capital Territory New South Wales
## [3] Northern Territory Queensland
## [5] South Australia Tasmania
## [7] Victoria Western Australia
## 8 Levels: Australian Capital Territory New South Wales ... Western Australia
unique(aus_livestock$Animal) #syntax for Lambs is "Lambs"
## [1] Bulls, bullocks and steers Calves
## [3] Cattle (excl. calves) Cows and heifers
## [5] Lambs Pigs
## [7] Sheep
## 7 Levels: Bulls, bullocks and steers Calves ... Sheep
NSW_lambs <- aus_livestock |>
filter(Animal == "Lambs", State == "New South Wales") # selecting Bricks produced per quarter
NSW_lambs |> autoplot(Count) +
labs(y = "Lambs", title = "NSW Lambs")
upon first glance there does not seem to be a clear pattern to the data. I believe naive should be the best approach especially since the drift slope is not very high.
lambs_naive <- NSW_lambs |> filter_index(. ~"2015 Jan") |> model(NAIVE(Count)) #forecast based on the past
lamb_forecast <- lambs_naive |>
forecast(h = "5 years") #make the forecast model go out to 5 years.
NSW_lambs |> autoplot(Count) +
lamb_forecast |> autolayer(filter_index(Count), alpha = 0.4) + #graph the forecast from 2015.
labs(y = "Lambs", title = "NSW lamb")
even with the large level assoicated with tit overall it follows the data well especially at the scale shown.
##Household wealth (hh_budget).
head(hh_budget)
## # A tsibble: 6 x 8 [1Y]
## # Key: Country [1]
## Country Year Debt DI Expenditure Savings Wealth Unemployment
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia 1995 95.7 3.72 3.40 5.24 315. 8.47
## 2 Australia 1996 99.5 3.98 2.97 6.47 315. 8.51
## 3 Australia 1997 108. 2.52 4.95 3.74 323. 8.36
## 4 Australia 1998 115. 4.02 5.73 1.29 339. 7.68
## 5 Australia 1999 121. 3.84 4.26 0.638 354. 6.87
## 6 Australia 2000 126. 3.77 3.18 1.99 350. 6.29
household_wealth <- hh_budget |>filter(Country=="Australia") |> select(Year, Wealth)
#selecting the australian household wealth
household_wealth |> autoplot(Wealth) +
labs(y = "wealth", title = "Aus household wealth")
#attempting a seasonal to see how well it predicts
wealth_train <- household_wealth |> filter_index(. ~"2006") #attempting to see how well the first 11 years predicts the next 11 years
wealth_model <- wealth_train |> model(SNAIVE(Wealth ~ lag("11 years"))) #forecast based on the past 11 years
wealth_predict <- wealth_model |>
forecast(h = "11 years") #make the forecast model go out to 11 years.
household_wealth |> autoplot(Wealth) +
wealth_predict |> autolayer(filter_index(Wealth), alpha = 0.4) + #graph the forecast from the year the training data ends (2006). alpha was added to make it more transparent
labs(y = "Wealth", title = "Australia wealth")
Taking an 11 year seasonal approach yielded some interesting results. it shows that the previous 11 years was not a bad predictor of the last 11 years.
##Australian takeaway food turnover (aus_retail).
head(aus_retail)
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Apr 4.4
## 2 Australian Capital Territory Cafes, restaurants… A3349849A 1982 May 3.4
## 3 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Jun 3.6
## 4 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Jul 4
## 5 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Aug 3.6
## 6 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Sep 4.2
unique(aus_retail$Industry) #"Takeaway food services"
## [1] "Cafes, restaurants and catering services"
## [2] "Cafes, restaurants and takeaway food services"
## [3] "Clothing retailing"
## [4] "Clothing, footwear and personal accessory retailing"
## [5] "Department stores"
## [6] "Electrical and electronic goods retailing"
## [7] "Food retailing"
## [8] "Footwear and other personal accessory retailing"
## [9] "Furniture, floor coverings, houseware and textile goods retailing"
## [10] "Hardware, building and garden supplies retailing"
## [11] "Household goods retailing"
## [12] "Liquor retailing"
## [13] "Newspaper and book retailing"
## [14] "Other recreational goods retailing"
## [15] "Other retailing"
## [16] "Other retailing n.e.c."
## [17] "Other specialised food retailing"
## [18] "Pharmaceutical, cosmetic and toiletry goods retailing"
## [19] "Supermarket and grocery stores"
## [20] "Takeaway food services"
unique(aus_retail$State)
## [1] "Australian Capital Territory" "New South Wales"
## [3] "Northern Territory" "Queensland"
## [5] "South Australia" "Tasmania"
## [7] "Victoria" "Western Australia"
#not sure which state it wants assuming New South Wales like in the lamb example
takeaway_turn <- aus_retail |>filter(Industry=="Takeaway food services", State == "New South Wales") |> select(Month, Turnover)
#selecting the australian takeaway turnover
takeaway_turn |> autoplot(Turnover) +
labs(y = "Turnover", title = "Takeaway food turnover")
#check drift and seasonal
turn_train <- takeaway_turn |>
filter_index("2000 Jan"~"2011 Jan") #selecting a subset of the data to train with only based on the previous decade or so.
turn_model <- turn_train |> model(RW(Turnover ~ drift())) #take the 11 years of training data and train the drift model
turn_forecast <- turn_model |>
forecast(h = "10 years") #make the forecast model go out to 10 years.
takeaway_turn |> autoplot(Turnover) +
turn_forecast |> autolayer(filter_index(Turnover), alpha = 0.4) + #graph the forecast
labs(y = "Turnover", title = "takeaway turnover NSW")
well the actual data is well within the predicted levels of the drift
model.
5.2) Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
head(gafa_stock)
## # A tsibble: 6 x 8 [!]
## # Key: Symbol [1]
## Symbol Date Open High Low Close Adj_Close Volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2014-01-02 79.4 79.6 78.9 79.0 67.0 58671200
## 2 AAPL 2014-01-03 79.0 79.1 77.2 77.3 65.5 98116900
## 3 AAPL 2014-01-06 76.8 78.1 76.2 77.7 65.9 103152700
## 4 AAPL 2014-01-07 77.8 78.0 76.8 77.1 65.4 79302300
## 5 AAPL 2014-01-08 77.0 77.9 77.0 77.6 65.8 64632400
## 6 AAPL 2014-01-09 78.1 78.1 76.5 76.6 65.0 69787200
unique(gafa_stock$Symbol) # assuming FB is facebook
## [1] "AAPL" "AMZN" "FB" "GOOG"
FB_stock <- gafa_stock |> filter(Symbol == "FB") |> select(Date, Open) # chose open for the stock prices.
#FB_stock$Date <- ymd(FB_stock$Date)
FB_stock |> autoplot(Open) +
labs(y="Open Stock price", title = "FB Stock price")
FB_stock <- FB_stock |> update_tsibble(regular = TRUE)
FB_stock <- FB_stock |> fill_gaps() # fill gaps and regular = True required to avoid the model from erroring out.
FB_model <- FB_stock |> model(RW(Open ~ drift())) # all the data was used as training for the drift model
#FB_model <- FB_model |> update_tsibble(regular = TRUE)
FB_forecast <- FB_model |>
forecast(h = "2 years") #make the forecast model go out to 2 years.
FB_stock |> autoplot(Open) +
FB_forecast |> autolayer(filter_index(Open)) + #graph the forecast
labs(y = "Open Stock price", title = "FB stock price drift")
Model is added to the graph
Show that the forecasts are identical to extending the line drawn between the first and last observations.
first <- FB_stock |> slice(1) #picking the fist point in the data by row number
last <- FB_stock |> slice(n()) #picking the last point in the data by row number
#putting the first and last point in a dataframe so that a line can be drawn between them later. keeping the column names the same
firstlast <- data.frame(
Date = c(first$Date, last$Date) ,
Open = c(first$Open, last$Open )
)
#graphing the FB stock price
FB_stock |> autoplot(Open) +
#graph the line through the first and last point
geom_line(data = firstlast, aes( data=firstlast, x = Date, y = Open), color = "blue", linetype = "dashed", size = 2 ) +
FB_forecast |> autolayer(filter_index(Open)) + #graph the forecast
labs(y = "Open", title = "FB Stock Price")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_line(data = firstlast, aes(data = firstlast, x = Date, y =
## Open), : Ignoring unknown aesthetics: data
## Don't know how to automatically pick scale for object of type <data.frame>.
## Defaulting to continuous.
Try using some of the other benchmark functions to forecast the same
data set. Which do you think is best? Why?
#simple benchmark mean takes the mean of the data
FB_mean_model <- FB_stock |> model(MEAN(Open))
FB_meanforecast <- FB_mean_model |>
forecast(h = "2 years")
FB_stock |> autoplot(Open) +
FB_meanforecast |> autolayer(filter_index(Open)) + #graph the forecast
labs(y = "Open Stock price", title = "FB stock price mean")
#simple benchmark naive takes last point.
FB_naive_model <- FB_stock |> model(NAIVE(Open))
FB_naiveforecast <- FB_naive_model |>
forecast(h = "2 years")
FB_stock |> autoplot(Open) +
FB_naiveforecast |> autolayer(filter_index(Open)) + #graph the forecast
labs(y = "Open Stock price", title = "FB stock price naive")
I think the naive out of the simple benchmarks is best due to being
close to the last stock price and allowing fluctuations in the price as
long there is not a signficant drift. The drift is not consistant due to
the rising and falling of the stock price.
##5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
recent_production <- aus_production |> filter(year(Quarter) >= 1992) # Define and estimate a model fit <- recent_production |> model(SNAIVE(Beer)) # Look at the residuals fit |> gg_tsresiduals() # Look a some forecasts fit |> forecast() |> autoplot(recent_production) What do you conclude?
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)
#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug1 <- fit |>
augment()
pvalue1 <- aug1 |> features(.resid, ljung_box, lag = 12) #3 x 4 quarters
print(pvalue1)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 38.7 0.000116
There is a couple of residuals outside the bounds of the acf graph which indicates there may be some autocorrelation meaning the residuals are outside the bounds of white noise. the distribution is not perfectly normal. applying a ljung_box Portmanteau test gives us a p value less than 0.05 which means we can reject the null hypothesis of the residuals being consistant with white noise.
##5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
bricks_model2 <- bricks |> model(SNAIVE(Bricks ~ lag("5 years"))) #forecast based on the past 5 years
bricks_model2 |> gg_tsresiduals()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
bricks_model2 |> forecast() |> autoplot(bricks)
#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug2 <- bricks_model2 |>
augment()
pvalue2 <- aug2 |> features(.resid, ljung_box, lag = 60)
print(pvalue2)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 "SNAIVE(Bricks ~ lag(\"5 years\"))" 1205. 0
For the bricks model there is a large amount of autocorrelation. the p value is “0”. this is not a good model.
head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
aus_ex <- global_economy |> filter(Country == "Australia") |> select(Year, Exports)
aus_ex |> autoplot(Exports)
#trying a naive
aus_ex_model <- aus_ex |> model(NAIVE(Exports))
aus_ex_model |> forecast() |> autoplot(aus_ex)
aus_ex_model |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug3 <- aus_ex_model |>
augment()
pvalue3 <- aug3 |> features(.resid, ljung_box, lag = 10)
print(pvalue3)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Exports) 16.4 0.0896
For the naive the distrubtion of the residuals is normal. there is one residual outside the autocorrelation function. however the p value is high at 0.09 for a lag of 10. The naive appears to be a good fit due to the residuals appearing to be like white noise.
##5.7
For your retail time series (from Exercise 7 in Section 2.10):
Create a training dataset consisting of observations before 2011 using
myseries_train <- myseries |> filter(year(Month) < 2011) Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) + autolayer(myseries_train, Turnover, colour = “red”) Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |> model(SNAIVE()) Check the residuals.
fit |> gg_tsresiduals() Do the residuals appear to be uncorrelated and normally distributed?
Produce forecasts for the test data
fc <- fit |> forecast(new_data = anti_join(myseries, myseries_train)) fc |> autoplot(myseries) Compare the accuracy of your forecasts against the actual values.
fit |> accuracy() fc |> accuracy(myseries) How sensitive are the accuracy measures to the amount of training data used?
set.seed(87654321)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fitacc <- fit |> accuracy()
print(fitacc)
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai… 5.12 10.2 7.33 5.99 8.82 1 1 0.681
fcacc <- fc |> accuracy(myseries)
print(fcacc)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Footwea… Test -1.47 20.4 16.0 -2.37 9.54 2.19 1.99 0.646
Residuals seem to be correlated the residuals have a right skewed distribution. there is a high correclation at low lag values. the actual data goes well outside the limits of levels of the forecast. The mean absolute percent error of the forecast is 9.535764 and the root mean square error is higher for the forcast than it is for the model fitting to the training data. the forecast has a MASE greater than 1 which indicates it is not better than the naive model.
set.seed(87654321)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train2 <- myseries |>
filter(year(Month) < 2016)
autoplot(myseries, Turnover) +
autolayer(myseries_train2, Turnover, colour = "red")
fit <- myseries_train2 |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc2 <- fit |>
forecast(new_data = anti_join(myseries, myseries_train2))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fitacc2 <- fit |> accuracy()
print(fitacc2)
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai… 4.04 11.8 8.52 4.79 8.91 1 1 0.731
fcacc2 <- fc2 |> accuracy(myseries)
print(fcacc2)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Footwea… Test 12.1 18.3 13.3 6.16 7.07 1.57 1.55 0.400
Increasing the amount of training data does appear to improve the model error slightly. It is not very sensisitive to changes of a few years.