5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

aus_pop <- global_economy |>
  filter(Code == "AUS") |>
  select(Country, Year, Population)
fit_pop <- aus_pop |>
  model(RW(Population ~ drift()))
fit_pop
## # A mable: 1 x 2
## # Key:     Country [1]
##   Country   `RW(Population ~ drift())`
##   <fct>                        <model>
## 1 Australia              <RW w/ drift>
fit_pop |>
  forecast(h = "4 years") |>
  autoplot(aus_pop)
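Before committing to drift, it helps to compare it directly with the naïve method (a quick sketch; the model labels naive and drift are my own):

# Sketch: drift extrapolates the trend, while the naïve method flat-lines at the last value
aus_pop |>
  model(
    naive = NAIVE(Population),
    drift = RW(Population ~ drift())
  ) |>
  forecast(h = "4 years") |>
  autoplot(aus_pop, level = NULL)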

I chose the drift method because the population series shows a steady upward trend, and drift extrapolates that trend into the future, which the naïve and seasonal naïve methods cannot do.

Bricks (aus_production)

bricks <- aus_production |>
  filter_index("1970 Q1" ~ "2004 Q4") |>
  select(Bricks)

fit_bricks <- bricks |>
  model(SNAIVE(Bricks))

fc_bricks <- fit_bricks |>
  forecast(h = 10)

fc_bricks |>
  autoplot(bricks, level = c(25, 50)) +
  labs(
    title = "Brick Production Forecast",
    y = "Brick Production",
    x = "Quarter") +
  theme_minimal()

For this data set, I chose the SNAIVE method to account for the quarterly seasonality observed in the original data (the plots sketched below confirm it).
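The seasonality itself is easy to check with a couple of feasts plots (a sketch, output not shown):

# Sketch: seasonal plot and ACF both reveal the quarterly pattern SNAIVE exploits
bricks |> gg_season(Bricks)
bricks |> ACF(Bricks) |> autoplot()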

NSW Lambs (aus_livestock)

be_a_lamb <- aus_livestock |>
  filter(Animal == "Lambs", State == "New South Wales") |>
  filter_index("1987 Dec" ~ "2018 Dec") |>
  select(Month, Count) |>
  rename(Num_Produced = Count)
fit_lamb <- be_a_lamb |>
  model(RW(Num_Produced ~ drift()))
fc_lamb <- fit_lamb |>
  forecast(h = "1 year")
fc_lamb |>
  autoplot(be_a_lamb, level = NULL)

Household wealth (hh_budget).

moolah <- hh_budget |>
  select(Year, Wealth)
moolah_fit <- moolah |>
  model(NAIVE(Wealth))
fc_moolah <- moolah_fit |>
  forecast(h = 5)  # five years ahead (annual data)

autoplot(fc_moolah, moolah) +
  guides(colour = "none") +  # remove the colour legend (one line per country)
  labs(title = "Wealth Forecast",
       y = "Wealth",
       x = "Year") +
  theme_minimal()

Australian takeaway food turnover (aus_retail).

takeout <- aus_retail |>
  filter(Industry == "Takeaway food services",
         State == "Australian Capital Territory") |>
  select(Turnover)
takeout_fit <- takeout |>
  model(NAIVE(Turnover),
        MEAN(Turnover),
        SNAIVE(Turnover ~ lag("year")))
fc_takeout <- takeout_fit |>
  forecast(h = 18)
fc_takeout |>
  autoplot(takeout, level = c(20, 50))

I chose the NAIVE method as the primary benchmark for this economic time series, since the most recent observation carries the most information about the next one. The SNAIVE model is also quite effective here, as it captures the seasonality present in the data; a numerical comparison is sketched below.
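To make that comparison concrete, one option is to hold out the last 18 months and score each benchmark against them (a sketch; takeout_train is my own name, output not shown):

# Sketch: hold out the final 18 months and compare the three benchmarks numerically
takeout_train <- takeout |>
  filter(Month <= max(Month) - 18)
takeout_train |>
  model(NAIVE(Turnover),
        MEAN(Turnover),
        SNAIVE(Turnover ~ lag("year"))) |>
  forecast(h = 18) |>
  accuracy(takeout)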

5.2

Use the Facebook stock price (data set gafa_stock) to do the following: Produce a time plot of the series. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

gafa_stock |>
  filter(Symbol == "FB") |>
  autoplot(Close)

fb_closing_price <- gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)  # re-index by trading day
fb_fit <- fb_closing_price |>
  filter(day >= 500) |>
  model(RW(Close ~ drift()))
fb_forecast <- fb_fit |>
  forecast(h = 100)
fb_forecast |>
  autoplot(fb_closing_price, level = c(25, 50)) +
  # dashed line from the first to the last observation of the training window
  annotate("segment",
           x = 500, y = fb_closing_price$Close[500],
           xend = nrow(fb_closing_price),
           yend = fb_closing_price$Close[nrow(fb_closing_price)],
           colour = "green", linetype = "dashed", linewidth = 1)
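The drift forecasts continue along the dashed segment. This can also be checked numerically: the drift slope is (last - first) / (T - 1) over the training window (a sketch; train is my own name):

# Sketch: the drift coefficient b should equal the slope of the first-to-last line
train <- fb_closing_price |> filter(day >= 500)
(last(train$Close) - first(train$Close)) / (nrow(train) - 1)
fb_fit |> tidy()  # compare with the fitted drift term b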

fb_fit2 <- fb_closing_price |>
  filter(day >= 500) |>
  model(NAIVE(Close),
        MEAN(Close),
        RW(Close ~ drift()))
fb_forecast2 <- fb_fit2 |>
  forecast(h = 100)
fb_forecast2 |>
  autoplot(fb_closing_price, level = c(25, 50)) +
  annotate("segment",
           x = 500, y = fb_closing_price$Close[500],
           xend = nrow(fb_closing_price),
           yend = fb_closing_price$Close[nrow(fb_closing_price)],
           colour = "green", linetype = "dashed", linewidth = 1)

I experimented with several benchmark methods for this stock price and found that the drift forecast is highly sensitive to the starting point of the training window, since its slope depends only on the first and last observations. For that reason, in practical applications I would prefer the MEAN method here.

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

augment(fit) |>
  features(.resid, ljung_box, lag = 2)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    4.11     0.128

What do you conclude?

With a lag of 2, the p-value of 0.128 means we fail to reject the null hypothesis that the residuals are independent, so the residuals are not easily distinguishable from white noise. (Note that for seasonal data the usual recommendation is lag = 2m; quarterly beer production has m = 4, so lag = 8 would be the conventional choice, as sketched below.)
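For reference, the same test at the recommended seasonal lag would look like this (a sketch, output not shown):

# Sketch: Ljung-Box at lag 2m = 8 for quarterly data
augment(fit) |>
  features(.resid, ljung_box, lag = 8)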

5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

# Extract data of interest
recent_exports <- global_economy |>
  filter(Code == "AUS")
autoplot(recent_exports, Exports)

# Define and estimate a model
fit_exp <- recent_exports |> model(NAIVE(Exports))
# Look at the residuals
fit_exp |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
fit_exp |> forecast() |> autoplot(recent_exports, level = c(25, 50))

augment(fit_exp) |>
  features(.resid, ljung_box, lag = 10)
## # A tibble: 1 × 4
##   Country   .model         lb_stat lb_pvalue
##   <fct>     <chr>            <dbl>     <dbl>
## 1 Australia NAIVE(Exports)    16.4    0.0896

This time we used a lag of 10, the usual recommendation for non-seasonal (annual) data. With an lb_pvalue of 0.0896 (> 0.05), we fail to reject the null hypothesis, so the white noise hypothesis holds for these residuals.
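The exercise also asks for the Bricks series from aus_production, which is strongly seasonal, so SNAIVE() is the appropriate benchmark there. A sketch of the same workflow (bricks_full and fit_bricks2 are my own names, output not shown):

# Sketch: same residual diagnostics for the seasonal Bricks series
bricks_full <- aus_production |>
  filter(!is.na(Bricks))                # drop the trailing missing quarters
fit_bricks2 <- bricks_full |> model(SNAIVE(Bricks))
fit_bricks2 |> gg_tsresiduals()
augment(fit_bricks2) |>
  features(.resid, ljung_box, lag = 8)  # lag = 2m with m = 4 for quarterly data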

5.7

For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(3231)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

fitmy <- myseries_train |>
  model(SNAIVE(Turnover ~ lag("year")))
fitmy |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

The residual distribution is not centred evenly around zero, and several autocorrelations fall outside the ACF bounds, indicating that meaningful information remains in the residuals.
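A portmanteau test backs up the visual impression (a sketch, output not shown):

# Sketch: Ljung-Box on the residuals at lag 2m = 24 for monthly data
augment(fitmy) |>
  features(.resid, ljung_box, lag = 24)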

fc <- fitmy |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries, level = NULL)

The model appears to have under-forecast the test data. I suspect this is due to the patterns left unaccounted for in the residuals noted above.

fitmy |> accuracy() |>
  arrange(.model) |>
  select(.model, .type, RMSE, MAE, MASE, RMSSE)
## # A tibble: 1 × 6
##   .model                             .type     RMSE   MAE  MASE RMSSE
##   <chr>                              <chr>    <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Turnover ~ lag(\"year\"))" Training  7.47  5.81     1     1
fc |> accuracy(myseries) |>
  arrange(.model) |>
  select(.model, .type, RMSE, MAE, MASE, RMSSE)
## # A tibble: 1 × 6
##   .model                             .type  RMSE   MAE  MASE RMSSE
##   <chr>                              <chr> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Turnover ~ lag(\"year\"))" Test   31.3  23.9  4.11  4.19

Smaller training datasets: accuracy measures can be more volatile and sensitive, especially to outliers or structural patterns the model does not capture.

Larger training datasets: accuracy measures tend to stabilise and become more representative of the model's overall performance. Also, the farther ahead the forecast aims to predict, the greater the chance of inaccuracy, as sketched below.
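One way to see that sensitivity is to refit on a shorter training window and compare the test accuracy (a sketch; myseries_short is my own name and the window is arbitrary, output not shown):

# Sketch: shorter training window, same SNAIVE benchmark, same test period
myseries_short <- myseries |>
  filter(year(Month) >= 2005, year(Month) < 2011)
myseries_short |>
  model(SNAIVE(Turnover ~ lag("year"))) |>
  forecast(new_data = anti_join(myseries, myseries_train)) |>  # same test set as before
  accuracy(myseries)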