Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your Rpubs link as well as your .pdf file showing your run code.

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.3.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.3.2
## ✔ lubridate   1.9.3     ✔ fable       0.3.4
## ✔ ggplot2     3.5.1     ✔ fabletools  0.4.2

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tsibble' was built under R version 4.3.3

## Warning: package 'tsibbledata' was built under R version 4.3.3

## Warning: package 'feasts' was built under R version 4.3.3

## Warning: package 'fabletools' was built under R version 4.3.3

## Warning: package 'fable' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.5
## ✔ purrr   1.0.2     ✔ stringr 1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(latex2exp)

## Warning: package 'latex2exp' was built under R version 4.3.3

library(seasonal)

## Warning: package 'seasonal' was built under R version 4.3.3

## 
## Attaching package: 'seasonal'
## 
## The following object is masked from 'package:tibble':
## 
##     view

library(seasonalview)

## Warning: package 'seasonalview' was built under R version 4.3.3

## 
## Attaching package: 'seasonalview'
## 
## The following object is masked from 'package:seasonal':
## 
##     view
## 
## The following object is masked from 'package:tibble':
## 
##     view

#loaded relevant libraries

5.1) Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget). Australian takeaway food turnover (aus_retail).

#first visuallizing the data
head(global_economy)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414

global_economy |>
  filter(Country == "Australia") |>  # selecting australia from global economy
  autoplot(Population) +
  labs(y = "Population", title = "Australian Population")

# data appears to be relatively linear.  will apply a drift forecast
Aus_pop <- global_economy |>
  filter(Country == "Australia") |>  # selecting australia from global economy
select(c("Country", "Year","Population")) #selecting population data and time
Aus_pop_training <- Aus_pop |> 
  filter_index("1960"~"2016") #selecting a subset of the data to train with. selected most of the data minus one year so that the forecast and the original graph connect
aus_pop_drift <- Aus_pop_training |> model(RW(Population ~ drift())) #take the training data and train the drift model
aus_pop_forecast <- aus_pop_drift |> 
  forecast(h = "20 years") #make the forecast model go out to 20 years.
  Aus_pop |>   autoplot(Population) +
    aus_pop_forecast |> autolayer(filter_index(Aus_pop, "2017"~ .)) #graph the forecast from 2017.

  labs(y = "Population", title = "Australia population")

## $y
## [1] "Population"
## 
## $title
## [1] "Australia population"
## 
## attr(,"class")
## [1] "labels"

the forecast predicts the next 20 years. though the lower slope of 1960 until 2006 predominates the predicted slope because more data is at the lower slope. after 2006 the slope increases. showing another forecast just based on the most recent years shows the difference in these slopes

  Aus_pop_training_short <- Aus_pop |> 
  filter_index("2006"~"2016") #selecting a subset of the data to train with only based on the most recent years. 
aus_pop_drift_short <- Aus_pop_training_short |> model(RW(Population ~ drift())) #take the shorter training data and train the drift model
aus_pop_forecast_short <- aus_pop_drift_short |> 
  forecast(h = "20 years") #make the forecast model go out to 20 years.
  Aus_pop |>   autoplot(Population) +
    aus_pop_forecast_short |> autolayer(filter_index(Aus_pop, "2017"~ .)) #graph the forecast from 2017.

  labs(y = "Population", title = "Australia population")

## $y
## [1] "Population"
## 
## $title
## [1] "Australia population"
## 
## attr(,"class")
## [1] "labels"

the slope dramatically increases based on the slope of the more recent years training data.

#first visuallizing the data
head(aus_production)

## # A tsibble: 6 x 7 [1Q]
##   Quarter  Beer Tobacco Bricks Cement Electricity   Gas
##     <qtr> <dbl>   <dbl>  <dbl>  <dbl>       <dbl> <dbl>
## 1 1956 Q1   284    5225    189    465        3923     5
## 2 1956 Q2   213    5178    204    532        4436     6
## 3 1956 Q3   227    5297    208    561        4806     7
## 4 1956 Q4   308    5681    197    570        4418     6
## 5 1957 Q1   262    5577    187    529        4339     5
## 6 1957 Q2   228    5651    214    604        4811     7

aus_production |>
  select("Quarter", "Bricks") |>  # selecting Bricks produced per quarter
  autoplot(Bricks) +
  labs(y = "Bricks", title = "Australian Bricks")

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

There is a clear seasonality to the data focusing on just the drift will not provide a good prediction. a snaive should provide some decent predictions

bricks <- aus_production |>  select("Quarter", "Bricks") |> filter(!is.na(Bricks)) #select the bricks data remove the last few years with na values.
bricks_train <- bricks |>  filter_index(. ~"2001 Q1") #cutting off the last few years to see how the model may predict the last 5 years
bricks_model <- bricks_train |> model(SNAIVE(Bricks ~ lag("5 years"))) #forecast based on the past 5 years
 
bricks_predict <- bricks_model |> 
  forecast(h = "5 years") #make the forecast model go out to 5 years.
  bricks |>   autoplot(Bricks) +
    bricks_predict |> autolayer(filter_index(bricks), alpha = 0.5) + #graph the forecast from 2000. alpha was added to make it more transparent
  labs(y = "Bricks", title = "Australia Bricks")

The forecast follows the seasonality of the data quite well. There is a large error prediction associated with it due the the fluctuations in the data. overall it follows the trend of the last 5 years specified. Esentially it is a copy and paste of the last five years from the training data set.

##NSW Lambs (aus_livestock)

#first visuallizing the data
head(aus_livestock)

## # A tsibble: 6 x 4 [1M]
## # Key:       Animal, State [1]
##      Month Animal                     State                        Count
##      <mth> <fct>                      <fct>                        <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory  2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory  2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory  2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory  1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory  2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory  1800

unique(aus_livestock$State) #assuming "New South Wales" is NSW

## [1] Australian Capital Territory New South Wales             
## [3] Northern Territory           Queensland                  
## [5] South Australia              Tasmania                    
## [7] Victoria                     Western Australia           
## 8 Levels: Australian Capital Territory New South Wales ... Western Australia

unique(aus_livestock$Animal) #syntax for Lambs is "Lambs"

## [1] Bulls, bullocks and steers Calves                    
## [3] Cattle (excl. calves)      Cows and heifers          
## [5] Lambs                      Pigs                      
## [7] Sheep                     
## 7 Levels: Bulls, bullocks and steers Calves ... Sheep

NSW_lambs <- aus_livestock |>
  filter(Animal == "Lambs", State == "New South Wales")   # selecting Bricks produced per quarter
 NSW_lambs |>  autoplot(Count) +
  labs(y = "Lambs", title = "NSW Lambs")

upon first glance there does not seem to be a clear pattern to the data. I believe naive should be the best approach especially since the drift slope is not very high.

lambs_naive <- NSW_lambs |> filter_index(. ~"2015 Jan") |> model(NAIVE(Count)) #forecast based on the past 
 
lamb_forecast <- lambs_naive |> 
  forecast(h = "5 years") #make the forecast model go out to 5 years.
  NSW_lambs |>   autoplot(Count) +
    lamb_forecast |> autolayer(filter_index(Count), alpha = 0.4) + #graph the forecast from 2015.
  labs(y = "Lambs", title = "NSW lamb")

even with the large level assoicated with tit overall it follows the data well especially at the scale shown.

##Household wealth (hh_budget).

head(hh_budget)

## # A tsibble: 6 x 8 [1Y]
## # Key:       Country [1]
##   Country    Year  Debt    DI Expenditure Savings Wealth Unemployment
##   <chr>     <dbl> <dbl> <dbl>       <dbl>   <dbl>  <dbl>        <dbl>
## 1 Australia  1995  95.7  3.72        3.40   5.24    315.         8.47
## 2 Australia  1996  99.5  3.98        2.97   6.47    315.         8.51
## 3 Australia  1997 108.   2.52        4.95   3.74    323.         8.36
## 4 Australia  1998 115.   4.02        5.73   1.29    339.         7.68
## 5 Australia  1999 121.   3.84        4.26   0.638   354.         6.87
## 6 Australia  2000 126.   3.77        3.18   1.99    350.         6.29

household_wealth <- hh_budget |>filter(Country=="Australia") |>  select(Year, Wealth)
#selecting the australian household wealth
household_wealth |>  autoplot(Wealth) +
  labs(y = "wealth", title = "Aus household wealth")

#attempting a seasonal to see how well it predicts
wealth_train <- household_wealth |>  filter_index(. ~"2006") #attempting to see how well the first 11 years predicts the next 11 years
wealth_model <- wealth_train |> model(SNAIVE(Wealth ~ lag("11 years"))) #forecast based on the past 11 years
 
wealth_predict <- wealth_model |> 
  forecast(h = "11 years") #make the forecast model go out to 11 years.
  household_wealth |>   autoplot(Wealth) +
  wealth_predict |> autolayer(filter_index(Wealth), alpha = 0.4) + #graph the forecast from the year the training data ends (2006). alpha was added to make it more transparent
  labs(y = "Wealth", title = "Australia wealth")

Taking an 11 year seasonal approach yielded some interesting results. it shows that the previous 11 years was not a bad predictor of the last 11 years.

##Australian takeaway food turnover (aus_retail).

head(aus_retail)

## # A tsibble: 6 x 5 [1M]
## # Key:       State, Industry [1]
##   State                        Industry            `Series ID`    Month Turnover
##   <chr>                        <chr>               <chr>          <mth>    <dbl>
## 1 Australian Capital Territory Cafes, restaurants… A3349849A   1982 Apr      4.4
## 2 Australian Capital Territory Cafes, restaurants… A3349849A   1982 May      3.4
## 3 Australian Capital Territory Cafes, restaurants… A3349849A   1982 Jun      3.6
## 4 Australian Capital Territory Cafes, restaurants… A3349849A   1982 Jul      4  
## 5 Australian Capital Territory Cafes, restaurants… A3349849A   1982 Aug      3.6
## 6 Australian Capital Territory Cafes, restaurants… A3349849A   1982 Sep      4.2

unique(aus_retail$Industry) #"Takeaway food services"

##  [1] "Cafes, restaurants and catering services"                         
##  [2] "Cafes, restaurants and takeaway food services"                    
##  [3] "Clothing retailing"                                               
##  [4] "Clothing, footwear and personal accessory retailing"              
##  [5] "Department stores"                                                
##  [6] "Electrical and electronic goods retailing"                        
##  [7] "Food retailing"                                                   
##  [8] "Footwear and other personal accessory retailing"                  
##  [9] "Furniture, floor coverings, houseware and textile goods retailing"
## [10] "Hardware, building and garden supplies retailing"                 
## [11] "Household goods retailing"                                        
## [12] "Liquor retailing"                                                 
## [13] "Newspaper and book retailing"                                     
## [14] "Other recreational goods retailing"                               
## [15] "Other retailing"                                                  
## [16] "Other retailing n.e.c."                                           
## [17] "Other specialised food retailing"                                 
## [18] "Pharmaceutical, cosmetic and toiletry goods retailing"            
## [19] "Supermarket and grocery stores"                                   
## [20] "Takeaway food services"

unique(aus_retail$State)

## [1] "Australian Capital Territory" "New South Wales"             
## [3] "Northern Territory"           "Queensland"                  
## [5] "South Australia"              "Tasmania"                    
## [7] "Victoria"                     "Western Australia"

#not sure which state it wants assuming New South Wales like in the lamb example
takeaway_turn <- aus_retail |>filter(Industry=="Takeaway food services", State == "New South Wales") |>  select(Month, Turnover)
#selecting the australian takeaway turnover
takeaway_turn |>  autoplot(Turnover) +
  labs(y = "Turnover", title = "Takeaway food turnover")

#check drift and seasonal

  turn_train <- takeaway_turn |> 
  filter_index("2000 Jan"~"2011 Jan") #selecting a subset of the data to train with only based on the previous decade or so. 
turn_model <- turn_train |> model(RW(Turnover ~ drift())) #take the 11 years of training data and train the drift model
turn_forecast <- turn_model |> 
  forecast(h = "10 years") #make the forecast model go out to 10 years.
  takeaway_turn |>   autoplot(Turnover) +
    turn_forecast |> autolayer(filter_index(Turnover), alpha = 0.4) + #graph the forecast
  labs(y = "Turnover", title = "takeaway turnover NSW")

well the actual data is well within the predicted levels of the drift model.

5.2) Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

head(gafa_stock)

## # A tsibble: 6 x 8 [!]
## # Key:       Symbol [1]
##   Symbol Date        Open  High   Low Close Adj_Close    Volume
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>
## 1 AAPL   2014-01-02  79.4  79.6  78.9  79.0      67.0  58671200
## 2 AAPL   2014-01-03  79.0  79.1  77.2  77.3      65.5  98116900
## 3 AAPL   2014-01-06  76.8  78.1  76.2  77.7      65.9 103152700
## 4 AAPL   2014-01-07  77.8  78.0  76.8  77.1      65.4  79302300
## 5 AAPL   2014-01-08  77.0  77.9  77.0  77.6      65.8  64632400
## 6 AAPL   2014-01-09  78.1  78.1  76.5  76.6      65.0  69787200

unique(gafa_stock$Symbol) # assuming FB is facebook

## [1] "AAPL" "AMZN" "FB"   "GOOG"

FB_stock <- gafa_stock |> filter(Symbol == "FB") |> select(Date, Open) # chose open for the stock prices.
#FB_stock$Date <- ymd(FB_stock$Date)
FB_stock |> autoplot(Open) +
  labs(y="Open Stock price", title = "FB Stock price")

 FB_stock <- FB_stock |> update_tsibble(regular = TRUE)
FB_stock <- FB_stock |> fill_gaps() # fill gaps and regular = True required to avoid the model from erroring out.
FB_model <- FB_stock |> model(RW(Open ~ drift())) # all the data was used as training for the drift model
#FB_model <- FB_model |> update_tsibble(regular = TRUE)
FB_forecast <- FB_model |> 
  forecast(h = "2 years") #make the forecast model go out to 2 years.
  FB_stock |>   autoplot(Open) +
    FB_forecast |> autolayer(filter_index(Open)) + #graph the forecast
  labs(y = "Open Stock price", title = "FB stock price drift")

Model is added to the graph

Show that the forecasts are identical to extending the line drawn between the first and last observations.

   first <- FB_stock |> slice(1)    #picking the fist point in the data by row number
last <- FB_stock |> slice(n())     #picking the last point in the data by row number
#putting the first and last point in a dataframe so that a line can be drawn between them later.  keeping the column names the same
firstlast <- data.frame(        
  Date = c(first$Date, last$Date) ,
  Open = c(first$Open, last$Open )
  )

#graphing the FB stock price
  FB_stock |>   autoplot(Open) +
        #graph the line through the first and last point
   geom_line(data = firstlast, aes( data=firstlast, x = Date, y = Open), color = "blue", linetype = "dashed", size = 2 ) +  
     FB_forecast |> autolayer(filter_index(Open)) + #graph the forecast
  labs(y = "Open", title = "FB Stock Price")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in geom_line(data = firstlast, aes(data = firstlast, x = Date, y =
## Open), : Ignoring unknown aesthetics: data

## Don't know how to automatically pick scale for object of type <data.frame>.
## Defaulting to continuous.

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

#simple benchmark mean takes the mean of the data
FB_mean_model <- FB_stock |> model(MEAN(Open))
FB_meanforecast <- FB_mean_model |> 
  forecast(h = "2 years")
FB_stock |>   autoplot(Open) +
    FB_meanforecast |> autolayer(filter_index(Open)) + #graph the forecast
  labs(y = "Open Stock price", title = "FB stock price mean")

#simple benchmark naive takes last point.
FB_naive_model <- FB_stock |> model(NAIVE(Open))
FB_naiveforecast <- FB_naive_model |> 
  forecast(h = "2 years")
FB_stock |>   autoplot(Open) +
    FB_naiveforecast |> autolayer(filter_index(Open)) + #graph the forecast
  labs(y = "Open Stock price", title = "FB stock price naive")

I think the naive out of the simple benchmarks is best due to being close to the last stock price and allowing fluctuations in the price as long there is not a signficant drift. The drift is not consistant due to the rising and falling of the stock price.

##5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

Extract data of interest

recent_production <- aus_production |> filter(year(Quarter) >= 1992) # Define and estimate a model fit <- recent_production |> model(SNAIVE(Beer)) # Look at the residuals fit |> gg_tsresiduals() # Look a some forecasts fit |> forecast() |> autoplot(recent_production) What do you conclude?

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug1 <- fit |> 
  augment()

pvalue1 <- aug1 |> features(.resid, ljung_box, lag = 12) #3 x 4 quarters


print(pvalue1)

## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    38.7  0.000116

There is a couple of residuals outside the bounds of the acf graph which indicates there may be some autocorrelation meaning the residuals are outside the bounds of white noise. the distribution is not perfectly normal. applying a ljung_box Portmanteau test gives us a p value less than 0.05 which means we can reject the null hypothesis of the residuals being consistant with white noise.

##5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

bricks_model2 <- bricks |> model(SNAIVE(Bricks ~ lag("5 years"))) #forecast based on the past 5 years
bricks_model2 |> gg_tsresiduals()

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
bricks_model2 |> forecast() |> autoplot(bricks)

#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug2 <- bricks_model2 |> 
  augment()

pvalue2 <- aug2 |> features(.resid, ljung_box, lag = 60) 


print(pvalue2)

## # A tibble: 1 × 3
##   .model                              lb_stat lb_pvalue
##   <chr>                                 <dbl>     <dbl>
## 1 "SNAIVE(Bricks ~ lag(\"5 years\"))"   1205.         0

For the bricks model there is a large amount of autocorrelation. the p value is “0”. this is not a good model.

head(global_economy)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414

aus_ex <- global_economy |> filter(Country == "Australia") |> select(Year, Exports)
aus_ex |> autoplot(Exports)

#trying a naive

aus_ex_model <- aus_ex |> model(NAIVE(Exports))
aus_ex_model |> forecast() |> autoplot(aus_ex)

aus_ex_model |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

#take the model for pvalue residual analysis Portmanteau tests for autocorrelation
aug3 <- aus_ex_model |> 
  augment()

pvalue3 <- aug3 |> features(.resid, ljung_box, lag = 10) 


print(pvalue3)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    16.4    0.0896

For the naive the distrubtion of the residuals is normal. there is one residual outside the autocorrelation function. however the p value is high at 0.09 for a lag of 10. The naive appears to be a good fit due to the residuals appearing to be like white noise.

##5.7

For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

myseries_train <- myseries |> filter(year(Month) < 2011) Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) + autolayer(myseries_train, Turnover, colour = “red”) Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |> model(SNAIVE()) Check the residuals.

fit |> gg_tsresiduals() Do the residuals appear to be uncorrelated and normally distributed?

Produce forecasts for the test data

fc <- fit |> forecast(new_data = anti_join(myseries, myseries_train)) fc |> autoplot(myseries) Compare the accuracy of your forecasts against the actual values.

fit |> accuracy() fc |> accuracy(myseries) How sensitive are the accuracy measures to the amount of training data used?

set.seed(87654321)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

fit <- myseries_train |>
  model(SNAIVE(Turnover))

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

fitacc <- fit |> accuracy()
print(fitacc)

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai…  5.12  10.2  7.33  5.99  8.82     1     1 0.681

fcacc <- fc |> accuracy(myseries)
print(fcacc)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Footwea… Test  -1.47  20.4  16.0 -2.37  9.54  2.19  1.99 0.646

Residuals seem to be correlated the residuals have a right skewed distribution. there is a high correclation at low lag values. the actual data goes well outside the limits of levels of the forecast. The mean absolute percent error of the forecast is 9.535764 and the root mean square error is higher for the forcast than it is for the model fitting to the training data. the forecast has a MASE greater than 1 which indicates it is not better than the naive model.

set.seed(87654321)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train2 <- myseries |>
  filter(year(Month) < 2016)
autoplot(myseries, Turnover) +
  autolayer(myseries_train2, Turnover, colour = "red")

fit <- myseries_train2 |>
  model(SNAIVE(Turnover))

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

fc2 <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train2))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

fitacc2 <- fit |> accuracy()
print(fitacc2)

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai…  4.04  11.8  8.52  4.79  8.91     1     1 0.731

fcacc2 <- fc2 |> accuracy(myseries)
print(fcacc2)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Footwea… Test   12.1  18.3  13.3  6.16  7.07  1.57  1.55 0.400

Increasing the amount of training data does appear to improve the model error slightly. It is not very sensisitive to changes of a few years.

Data 624 Assignment 3 KDD

Keith DeNivo

2024-10-01

Extract data of interest