library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.1
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.3.2
## ✔ lubridate 1.9.3 ✔ fable 0.3.4
## ✔ ggplot2 3.5.1 ✔ fabletools 0.4.2
## Warning: package 'tsibble' was built under R version 4.4.1
## Warning: package 'tsibbledata' was built under R version 4.4.1
## Warning: package 'feasts' was built under R version 4.4.1
## Warning: package 'fabletools' was built under R version 4.4.1
## Warning: package 'fable' was built under R version 4.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#call dataset for this question
data(aus_livestock)
#check dataset column values
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
# filter data need it and use ETS function
pig_fit <- aus_livestock %>%
filter(State == "Victoria") %>%
filter(Animal == "Pigs") %>%
model(ETS(Count ~ error("A") + trend("N") + season("N")))
# forecast of pigs for next four months
pig_fc <- pig_fit %>%
forecast(h = 4)
#Graph the forecst
pig_fc %>%
autoplot(aus_livestock) +
ggtitle("The Number of Pigs Slaughtered in Victoria")
# get the report of forecast to find values
report(pig_fit)
## Series: Count
## Model: ETS(A,N,N)
## Smoothing parameters:
## alpha = 0.3221247
##
## Initial states:
## l[0]
## 100646.6
##
## sigma^2: 87480760
##
## AIC AICc BIC
## 13737.10 13737.14 13750.07
Based on the report above, the optical value of α is 0.3221247 and ℓ0 is 100646.6.
# interval produced by R
hilo(pig_fc$Count, 95)
## <hilo[4]>
## [1] [76854.79, 113518.3]95 [75927.17, 114445.9]95 [75042.22, 115330.9]95
## [4] [74194.54, 116178.6]95
The table avobe shows us that for that the first prediction interval using a 95% CI is from 76854.79 to 113518.3.
# interval Calculated with 1.96
resid_std <- sd(augment(pig_fit)$.resid)
sprintf(
"Calculated confidence interval 1.96: [%f, %f]",
pig_fc$.mean[1] - (resid_std * 1.96),
pig_fc$.mean[1] + (resid_std * 1.96)
)
## [1] "Calculated confidence interval 1.96: [76871.012478, 113502.102384]"
Based on the tables above, the 95% prediction interval for the first forecast is between 76871.01 and 113502.1 when computed using the y^±1.96sformula.When we compared both tables, we can see that the intervals are very close to each other.
data("global_economy")
ecua_economy <- global_economy %>%
filter(Country == "Ecuador")
ecua_economy %>%
autoplot(Exports)
I decided to choose my home country to check the data on it, it seems to have ups and downs over the year, the data starts at 1960 and it has huge peaks at 1970, 1900,2000, and the hightest peak at 2010, the biggest decrease in by 2020 I guess because of the pandemic, also another significant decrease in before year 2000.
ecua_export <- ecua_economy %>%
model(ETS(Exports ~ error("A") + trend("N") + season("N")))
ecua_fc <- ecua_export %>%
forecast(h = 10)
ecua_fc %>%
autoplot(ecua_economy) +
labs(title = "Ecuador Annual Exports") +
guides(colour = guide_legend(title = "Forecast"))
ecua_RMSE <- ecua_export %>% accuracy()
ecua_RMSE
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ecuador "ETS(Exports ~… Trai… 0.212 3.17 2.31 0.216 11.8 0.996 0.990 0.0253
The RMSE value for the training data is 3.166439
ecua_economy %>%
model(
SES = ETS(Exports ~ error("A") + trend("N") + season("N")),
Holt = ETS(Exports ~ error("A") + trend("A") + season("N"))
) %>%
forecast(h = 10) %>%
autoplot(ecua_economy, level = NULL) +
labs(title = "Ecuador Annual Export") +
guides(colour = guide_legend(title = "Type"))
Based on the Graph above, SES method forecasts the very last observation for all forecast periods, in this particular case it is not the best choice since the trend tends to increase. Holt’s method might work better in this scenario, since the trend increases as the years go by.
I can use the same graph above to answer that question, I thonk that Holt’s method is the best option for this scenario, the trend in this data increases and SES method takes the last obvervation for the forecast, so Holt’s might be more accurate.
#R
ecua_fc %>% hilo(95) %>% pull('95%') %>% head(1)
## <hilo[1]>
## [1] [14.42087, 27.05278]95
# interval Calculated with 1.96 resid_std
resid_std <- sd(augment(ecua_export)$.resid)
sprintf(
"Calculated confidence interval 1.96: [%f, %f]",
ecua_fc$.mean[1] - (resid_std * 1.96),
ecua_fc$.mean[1] + (resid_std * 1.96)
)
## [1] "Calculated confidence interval 1.96: [14.490394, 26.983255]"
As we can see the the ranges on the interval calculation(14.490394, 26.983255) are slightly larger than the values with the R calculation (14.42087, 27.05278)
data_china <- global_economy %>%
filter(Country == "China") %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
china_fit <- global_economy %>%
filter(Country == "China") %>%
model(`Standard` = ETS(GDP ~ error("A") + trend("N") + season("N")),
`Holt's method` = ETS(GDP ~ error("A") + trend("A") + season("N")),
`Damped Holt's method` = ETS(GDP ~ error("A") + trend("Ad", phi = 0.8) + season("N")),
`Box-Cox` = ETS(box_cox(GDP,data_china) ~ error("A") + trend("Ad") + season("N")),
`Damped Box-Cox` = ETS(box_cox(GDP,data_china) ~ error("A") + trend("Ad", phi = 0.8) + season("N")))
china_fc <- china_fit %>%
forecast(h = 44)
china_fc %>%
autoplot(global_economy, level = NULL) +
labs(title="Chinese GDP Forecast with Different Models") +
guides(colour = guide_legend(title = "Forecast"))
Based on the graph above, Box Cox transformation trends the data higher than the rest of model, Holt’s method is the model with the most stady ascending curv, however, I believe that the Damped Box-Cox method is the most recommended model for this data.
data_gas <- aus_production %>%
model(`Additive` = ETS(Gas ~ error("A") + trend("A") + season("A")),
`Multiplicative` = ETS(Gas ~ error("M") + trend("A") + season("M")),
`Damped Multiplicative` = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M")))
aus_production %>%
model(`Additive` = ETS(Gas ~ error("A") + trend("A") + season("A")),
`Multiplicative` = ETS(Gas ~ error("M") + trend("A") + season("M")),
`Damped Multiplicative` = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9))) %>%
forecast(h=30) %>%
autoplot(aus_production, level = NULL) +
labs(title="Gas Production Forecast") +
guides(colour = guide_legend(title = "Forecast"))
report(data_gas)
## Warning in report.mdl_df(data_gas): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive 23.6 -927. 1872. 1873. 1903. 22.7 29.7 3.35
## 2 Multiplicative 0.00324 -831. 1681. 1682. 1711. 21.1 32.2 0.0413
## 3 Damped Multiplicative 0.00340 -835. 1688. 1689. 1719. 21.0 32.4 0.0424
accuracy(data_gas)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive Trai… 0.00525 4.76 3.35 -4.69 10.9 0.600 0.628 0.0772
## 2 Multiplicative Trai… -0.115 4.60 3.02 0.199 4.08 0.542 0.606 -0.0131
## 3 Damped Multiplic… Trai… 0.255 4.58 3.04 0.655 4.15 0.545 0.604 -0.00451
Based on the graph and the data analysis above,damped multiplicative method is might be the best option for this dataset, since the data has seasonal variation to the level. If seasonal variation was constant the additive method would be more recommended.Also the RMSE of the damped data is 4.580880 and differs and values from the other models additive is 4.763979 and multiplicative is 4.595113.
set.seed(271279)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot(Turnover)
multiplicative seasonality is necessary for this scenario, because the high peaks of the seasonal periods changes yearly.
fit <- myseries |>
model(
Multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
Damped = ETS(Turnover ~ error("M") + trend("Ad") + season("M"))
)
fc <- fit |> forecast(h = 17)
fc |>
autoplot(myseries, level = NULL) +
labs(title="Turnover Time Series",
y="turnover") +
guides(colour = guide_legend(title = "Forecast"))
fit %>%
accuracy()
## # A tibble: 2 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Western Aus… Clothin… Multi… Trai… 0.134 5.59 4.03 -0.0602 3.95 0.477 0.500
## 2 Western Aus… Clothin… Damped Trai… 0.382 5.63 4.01 0.197 3.90 0.474 0.503
## # ℹ 1 more variable: ACF1 <dbl>
The RMSE for the multiplicative model(5.594874) is lower compared to the Damped model(5.626189) which it makes the preferable model.
fit %>%
select(Damped) %>%
gg_tsresiduals()
The first plot looks like white noise( innovation residuals). However, the histogram show us some normality in the data, as well as the ACF graph.
#set the model for the dates required.
myseries_train <- myseries %>%
filter(year(Month) <= 2010)
myseries_test <- myseries %>%
filter(year(Month) > 2010)
myseries_fit <- myseries_train %>%
model(
Multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
Damped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.8) + season("M")),
NAIVE = NAIVE(Turnover)
)
myseries_forecasts <- myseries_fit |>
forecast(h = 12 * 8)
myseries_forecasts |>
autoplot(
myseries,
level = NULL
) +
labs(
title = "Monthly Turnover Forecast"
) +
guides(colour = guide_legend(title = "Forecast"))
accuracy(myseries_forecasts, myseries_test)
## # A tibble: 3 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Damped West… Clothin… Test 2.60 11.0 7.80 0.870 4.59 NaN NaN 0.335
## 2 Multi… West… Clothin… Test -17.6 20.0 17.8 -11.5 11.5 NaN NaN 0.496
## 3 NAIVE West… Clothin… Test -85.4 92.1 89.0 -57.9 59.3 NaN NaN 0.111
It looks like the Multiplicative method has a high RMSE(19.99288), the Damped method RMSE(11.03819) and the SNAIVE RMSE(92.08651) have a huge difference However, I think that the DAMPED method might be able to beat the NAIVE in exercise 7.
mylambda <- myseries_train |>
features(Turnover, guerrero) |>
pull(lambda_guerrero)
mylambda
## [1] 0.08953657
mytrainsqrt <- myseries_train |>
mutate(sqrtTurnover = sqrt(Turnover)) |>
select(sqrtTurnover)
mysqrtfit <- mytrainsqrt |>
model(
stl_sqrt = STL(sqrtTurnover ~ season(window = "periodic"), robust = TRUE),
ets_sqrt = ETS(sqrtTurnover),
holt_mult = ETS(sqrtTurnover ~ error("M") + trend("A") + season("M"))
)
mysqrtfit |>
accuracy()
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stl_sqrt Training 0.0396 0.287 0.188 0.238 1.97 0.408 0.507 0.172
## 2 ets_sqrt Training -0.00518 0.249 0.190 -0.0866 2.07 0.412 0.439 0.00737
## 3 holt_mult Training -0.00518 0.249 0.190 -0.0866 2.07 0.412 0.439 0.00737
We can see that ets_sqrt and holt_mult have the same RMSE stl_sqrt has a slightly lower values. The magnitude of the error is relatively close across the three methods here. I would say that the difference it’s not significant with my forecast test set(Lambda)