DATA 624 - Homework 5

8.1 Consider the the number of pigs slaughtered in Victoria, available in the aus_livestock dataset.

Use the ETS() function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of α and ℓ0 , and generate forecasts for the next four months.

 knitr::opts_chunk$set(warning = FALSE, message = FALSE)  
# package was not installed i have installed below package
#install.packages("tidyverse")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(fpp3)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.1
## ✔ tsibbledata 0.4.1     ✔ fable       0.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

df <- aus_livestock %>%
  filter(Animal == 'Pigs' & State == 'Victoria')

df %>%
  autoplot(Count) +
  labs(
    title = 'Pigs in Victoria'
  )

Pigs<-aus_livestock %>%
  filter(State=="Victoria" & Animal=="Pigs")


fit <- as_tsibble(Pigs) %>%
  model(ETS(Count ~ error("A") + trend("N") + season("N")))

#fit1<-ses(Pigs,h=4)

fc<-fit %>%
  forecast(h = 4)

print(tidy(fit))

## # A tibble: 2 × 5
##   Animal State    .model                                          term  estimate
##   <fct>  <fct>    <chr>                                           <chr>    <dbl>
## 1 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + sea… alpha  3.22e-1
## 2 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + sea… l[0]   1.01e+5

DISCUSSION:

Parameters

alpha=.322 l0=1.01e+5

fc %>%
  autoplot(Pigs) +
  geom_line(aes(y = .fitted), col="green",
            data = augment(fit)) +
  labs(y="Count", title="Victoria  Pigs") +
  guides(colour = "none")

DISCUSSION: Black line represents data, blue line is one step ahead.

b.Compute a 95% prediction interval for the first forecast using ^y±1.96s where s is the standard deviation of the residuals. Compare your interval with the interval produced by R.

h = unpack_hilo(hilo(fc, 95) , "95%" )

View(h)
h[1,]

## # A tsibble: 1 x 8 [1M]
## # Key:       Animal, State, .model [1]
##   Animal State    .model                                                   Month
##   <fct>  <fct>    <chr>                                                    <mth>
## 1 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Jan
## # ℹ 4 more variables: Count <dist>, .mean <dbl>, `95%_lower` <dbl>,
## #   `95%_upper` <dbl>

h$Count

## <distribution[4]>
## [1] N(95187, 8.7e+07) N(95187, 9.7e+07) N(95187, 1.1e+08) N(95187, 1.1e+08)

R_sd <- sqrt(distributional::variance(fc$Count[1]))
upper <- round(fc$.mean[1] + 1.96 * R_sd, 2)
lower <- round(fc$.mean[1] - 1.96 * R_sd, 2)

print_upper <- paste('Upper bound of first forecasted value =', upper)
print_lower <- paste('Lower bound of first forecasted value =', lower)

cat(print_upper, '\n', print_lower, sep='')

## Upper bound of first forecasted value = 113518.66
## Lower bound of first forecasted value = 76854.45

So the r generated interval is: [76855,113518]

Now, manually calculate.

# mean of first forecast is 95186.56
(m<-95186.56)

## [1] 95186.56

#get sigma
report(fit)

## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.3221247 
## 
##   Initial states:
##      l[0]
##  100646.6
## 
##   sigma^2:  87480760
## 
##      AIC     AICc      BIC 
## 13737.10 13737.14 13750.07

#sigma^2:  87480760
(sigma<-84780760^.5)

## [1] 9207.647

#Lower bound
m-(1.96*sigma)

## [1] 77139.57

#upper bound
m+(1.96*sigma)

## [1] 113233.5

DISCUSSION: r generated 95% CI: [76855,113518] manual 95% CI: [77140,113234]

The two intervals are reasonably close. The manual CI bounds are closer together.

8.5 Data set global_economy contains the annual Exports from many countries. Select one country to analyse.

a. Plot the Exports series and discuss the main features of the data.

I have chosen USA for the purpose of this question. The relevant data is pulled

data("global_economy")

USA_Exports <- global_economy %>%
  filter(Code == 'USA' & !is.na(Exports))

USA_Exports %>%
  autoplot(Exports) + 
  labs(
    title = 'Exports of USA as % of GDP ',    y = '% of GDP'  )

Plotting the data overall upwards trend:

b. Use an ETS(A,N,N) model to forecast the series, and plot the forecasts.

USA_Exports <- na.omit(USA_Exports)

usa_fit <- USA_Exports %>%
  model(ETS(Exports ~ error("A") + trend("N") + season("N")))


usa_fc <- usa_fit %>%
  forecast(h = 5)

usa_fc %>%
  autoplot(USA_Exports) +
  geom_line(aes(y = .fitted), col="green",
            data = augment(usa_fit)) +
  labs(y="% of GDP", title="Exports: United States") +
  guides(colour = "none")

c. Compute the RMSE values for the training data.

accuracy(usa_fit)

## # A tibble: 1 × 11
##   Country       .model     .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <fct>         <chr>      <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 United States "ETS(Expo… Trai… 0.125 0.632 0.468  1.35  5.09 0.982 0.991 0.239

d. Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.

usa_RMSE_ETS<-accuracy(USA_Exports %>%
                         model(
                           ANN = ETS (Exports ~ error("A") + trend("N") + season("N")),
                           AAN = ETS (Exports ~ error("A") + trend("A") + season("N"))
                         ))[["RMSE"]]
usa_RMSE_train<-accuracy(usa_fit)[["RMSE"]]

paste0("Training Model RMSE is ",usa_RMSE_train)

## [1] "Training Model RMSE is 0.631987696877015"

paste0("Compared to ")

## [1] "Compared to "

paste0(usa_RMSE_ETS)

## [1] "0.631987696877015" "0.620934771848593"

With the AAN model providing a smaller RMSE by about 0.011 than ANN, it suggests a better model lies with AAN.

Compare the forecasts from both methods. Which do you think is best?

USA_Exports %>%
  model(
    ANN = ETS (Exports ~ error("A") + trend("N") + season("N")),
    AAN = ETS (Exports ~ error("A") + trend("A") + season("N"))
  ) %>% 
  forecast(h=4) %>% 
  autoplot(USA_Exports, level=NULL) +
  labs(title="US Export Predictions")

Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.

usa_yhat <- usa_fc$.mean[1]


usa_aug <- augment(usa_fit)


usa_sd <- sd(usa_aug$.resid)


usa_upper95 <- usa_yhat + (usa_sd * 1.96)
usa_lower95 <- usa_yhat - (usa_sd * 1.96)


usa_hilo <- usa_fc %>% hilo()

paste0("Lower 95% ",usa_lower95," Mean ",usa_yhat, " Upper 95% ",usa_upper95)

## [1] "Lower 95% 10.6654088104703 Mean 11.8906832823187 Upper 95% 13.115957754167"

paste0("While our forecast had the values", usa_hilo$`95%`[1],
       " with a mean of ", usa_hilo$.mean[1] )

## [1] "While our forecast had the values[10.6292803146377, 13.1520862499996]95 with a mean of 11.8906832823187"

Both are accurate up to the first decimal. The method using R vs the manual, accounts for degrees of freedom and has a more precise value for the critical values, but also does gave a greater range.

8.6 Forecast the Chinese GDP from the global_economy data set using an ETS model. Experiment with the various options in the ETS() function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.

# package was not installed i have installed below package
#install.packages("cowplot")
library(cowplot)
china_df <- global_economy %>%
  filter(Code == 'CHN')

china_plot1<-china_df %>% autoplot(GDP) +
  labs(title="China GDP")

china_lambda <- china_df %>%
  features(GDP, features = guerrero) %>%
  pull(lambda_guerrero)

fit_china <- china_df %>% 
  model(
    # Find ETS
    ETS = ETS(GDP),
    # Transformation Log 
    `Log` = ETS(log(GDP)),
    # Damped Model
    `Damped` = ETS(GDP ~ trend("Ad")),
    # Box-Cox Transformation
    `Box-Cox` = ETS(box_cox(GDP, china_lambda)),
    # Damped Model with Box-Cox Transformation
    `Box-Cox, Damped` = ETS(box_cox(GDP, china_lambda) ~ trend("Ad"))
  )

china_plot2<-fit_china %>%
  forecast(h="20 years") %>%
  autoplot(china_df, level = NULL)+
  labs(title="Chinese GDP Forecast") +
  guides(colour = guide_legend(title = "Forecast"))

plot_grid(china_plot1,
          china_plot2, nrow = 2)

Damped and ETS show similar growth, while Log and Box-Cox seem to exaggerate its forecast. Box-Cox, Damped shows slightly more growth than ETS and Damped.

8.7 Find an ETS model for the Gas data from aus_production and forecast the next few years. Why is multiplicative seasonality necessary here? Experiment with making the trend damped. Does it improve the forecasts?

Seasonal variation makes multiplicative seasonality necessary.

#install.packages("cowplot")
library(cowplot)
gas_plot1<-aus_production %>% autoplot(Gas)+
  labs(title="Austrailian Gas Production")

gas_fit <- aus_production %>%
  model(
    # Multiplicative
    Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")),
    # Damped multiplicative
    `Multiplicative, Damped` = ETS(Gas ~ error("M") + trend("Ad") + season("M"))
  )
gas_fc <- gas_fit %>% forecast(h = "5 years")

gas_plot2<-gas_fc %>%
  autoplot(aus_production, level = NULL) +
  labs(title="Australian Gas Production Forecast") +
  guides(colour = guide_legend(title = "Forecast"))

plot_grid(gas_plot1,
          gas_plot2, nrow = 2)

Very little difference between the two models. About 3.099 difference, meaning either would be accurate or optimal

8.8 Recall your retail time series data (from Exercise 7 in Section 2.10).

Why is multiplicative seasonality necessary for this series?

Becuase there is clear seasonality and peaks on January

b. Apply Holt-Winters’ multiplicative method to the data. Experiment with making the trend damped.

set.seed(123)

myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myfit <- myseries %>%
  model(
    `Holt-Winters’ Multiplicative` = ETS(Turnover ~ error("M") + trend("A") +
                                           season("M")),
    `Holt-Winters’ Damped Multiplicative` = ETS(Turnover ~ error("M") + trend("Ad") +
                                                  season("M"))
  )


myfc <- myfit %>% forecast(h = "5 years")
myfc %>%
  autoplot(myseries, level = NULL) +
  labs(title="Australian Department Stores",
       y="Turnover") +
  guides(colour = guide_legend(title = "Forecast"))

c. Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer?

accuracy(myfit)%>%
  dplyr::select(".model","RMSE")

## # A tibble: 2 × 2
##   .model                               RMSE
##   <chr>                               <dbl>
## 1 Holt-Winters’ Multiplicative         24.3
## 2 Holt-Winters’ Damped Multiplicative  23.7

d. Check that the residuals from the best method look like white noise.

myfit%>%
  dplyr::select("Holt-Winters’ Damped Multiplicative")%>%gg_tsresiduals()

Holt-Winters’ Damped Multiplicative is not white noise base on the plot, which has over 5% of the spikes of the bounds made by dashed lines.

e. Now find the test set RMSE, while training the model to the end of 2010. Can you beat the seasonal naïve
approach from Exercise 7 in Section 5.11?

mytrain<-myseries%>%
  filter(Month >= yearmonth('2011 Jan'))

# seasonal naïve
myfit2 <- mytrain %>%
  model(
    "Holt-Winters' Damped" = ETS(Turnover ~ error("M") + trend("Ad") +
                                   season("M")),
    "Holt-Winters' Multiplicative" = ETS(Turnover ~ error("M") + trend("A") +
                                           season("M")),
    "Seasonal Naïve Forecast" = SNAIVE(Turnover)
  )


comparison <- anti_join(myseries, mytrain, 
                        by = c("State", "Industry", "Series ID", "Month", "Turnover"))

# forecasting according to comparison data
myfc2 <- myfit2 %>%
  forecast(comparison)

# plot
autoplot(comparison, Turnover) +
  autolayer(myfc2, level = NULL) +
  guides(colour=guide_legend(title="Forecast")) +
  ggtitle('Forecast Comparison ')

accuracy(myfit2)

## # A tibble: 3 × 12
##   State    Industry    .model .type     ME  RMSE   MAE     MPE  MAPE  MASE RMSSE
##   <chr>    <chr>       <chr>  <chr>  <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Household … Holt-… Trai…  1.97   26.3  21.1  0.135   2.01 0.393 0.400
## 2 Victoria Household … Holt-… Trai…  0.740  27.0  21.7 -0.0276  2.08 0.405 0.411
## 3 Victoria Household … Seaso… Trai… 47.0    65.7  53.6  4.23    4.90 1     1    
## # ℹ 1 more variable: ACF1 <dbl>

It appears the Damped model is the best performing, base on the RMSE values.

8.9 For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?

#find lambda
mylambda <- mytrain %>%
  features(Turnover, features = guerrero) %>%
  pull(lambda_guerrero)

#bc transformed data
ts_bc <- mytrain %>%
  mutate(
    bc_turnover = box_cox(Turnover, mylambda)
  )

# bc transformed model
fit <- ts_bc %>%
  model(
    'Box-Cox STL' = STL(bc_turnover ~ season(window = "periodic"),
                        robust = T),
    'Box-Cox ETS' = ETS(bc_turnover)
  )

# best previous model 
best_fit <-ts_bc %>%
  model(
    "Holt-Winters' Damped" = ETS(Turnover ~ error("M") + trend("Ad") +
                                                    season("M"))
  )

rbind(accuracy(fit),accuracy(best_fit))

## # A tibble: 3 × 12
##   State   Industry .model .type       ME   RMSE    MAE     MPE  MAPE  MASE RMSSE
##   <chr>   <chr>    <chr>  <chr>    <dbl>  <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 Victor… Househo… Box-C… Trai…  0.0482   0.417  0.285  0.126  0.702 0.310 0.370
## 2 Victor… Househo… Box-C… Trai… -0.00279  0.433  0.352 -0.0152 0.862 0.383 0.384
## 3 Victor… Househo… Holt-… Trai…  1.97    26.3   21.1    0.135  2.01  0.393 0.400
## # ℹ 1 more variable: ACF1 <dbl>

Based on the Values the Box-Cox ETS is the best performing of the three.

DATA 624 - Homework 5

Bikash Bhowmik

09 Mar 2025