library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.3.2
## ✔ lubridate 1.9.3 ✔ fable 0.3.4
## ✔ ggplot2 3.5.0 ✔ fabletools 0.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
data("global_economy")
head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
#the data will need some transformations, first we'll need to calculate gdp per captia by dividing gdp/population, then only keep the necessary columns for the time series
global_economy %>%
autoplot(GDP / Population, show.legend = FALSE) +
labs(title= "GDP per capita", y = "USD")
## Warning: Removed 3242 rows containing missing values or values outside the scale range
## (`geom_line()`).
#Due to the number of countries in the data and plotting them all on one chart, it's difficult to see what country has the highest GDP per capital. So we'll check the data by sorting and then examine the single country's GDP per capital over time.
global_highest <- global_economy %>%
group_by(Country, GDP, Population) %>%
summarise(GDP_per_capita = GDP / Population) %>%
arrange(desc(GDP_per_capita))
global_highest
## # A tsibble: 15,150 x 5 [1Y]
## # Key: Country, GDP, Population [15,149]
## # Groups: Country, GDP [11,965]
## Country GDP Population Year GDP_per_capita
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Monaco 7060236168. 38132 2014 185153.
## 2 Monaco 6476490406. 35853 2008 180640.
## 3 Liechtenstein 6657170923. 37127 2014 179308.
## 4 Liechtenstein 6391735894. 36834 2013 173528.
## 5 Monaco 6553372278. 37971 2013 172589.
## 6 Monaco 6468252212. 38499 2016 168011.
## 7 Liechtenstein 6268391521. 37403 2015 167591.
## 8 Monaco 5867916781. 35111 2007 167125.
## 9 Liechtenstein 6214633651. 37666 2016 164993.
## 10 Monaco 6258178995. 38307 2015 163369.
## # ℹ 15,140 more rows
#Monaco has the highest GDPPC.
global_economy %>%
filter(Country == "Monaco") %>%
autoplot(GDP/Population) +
labs(title= "GDP per capita for Monaco", y = "USD")
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Over time we can see that Monaco's GDP has overall steadily increased over the years, with a dip and then jump occuring in two places (1985 and 2002). Despite the decreases it still has a higher GDPPC than other countries in the dataset.
# United States GDP from global_economy.
global_economy %>%
filter(Country == "United States") %>%
autoplot(GDP/ 10^12) +
labs(title= "GDP per capita for the US", y = "USD (trillions)")
#The only transformation was to adjust the y axis labels to trillions to make it more readable.
#Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock.
data("aus_livestock")
aus_livestock %>%
filter(Animal == "Bulls, bullocks and steers", State == "Victoria") %>%
autoplot(Count) +
labs(title = "Slaughter of Victorian Bulls, bullocks and steers", y = "# Slaughtered")
#No transformation was needed here as readability is fine, I left month and year together as it is easy to read.
#Victorian Electricity Demand from vic_elec.
data("vic_elec")
v <- vic_elec %>%
group_by(Date) %>%
mutate(Demand = sum(Demand)) %>%
distinct(Date, Demand)
v %>%
as_tsibble(index = Date) %>%
autoplot(Demand) +
labs(title= "Daily Victorian Electricity Demand", y = "$US (in trillions)")
# We transformed the data to reflect daily vs half hourly electricity demands, this is easier to read and we can see the variation in days better (we can see a bigger picture)
# Gas production from aus_production.
aus_production %>%
autoplot(Gas) +
labs(title = "Non-Transformed Gas Production")
lambda <- aus_production %>%
features(Gas, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Gas, lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed gas production with $\\lambda$ = ",
round(lambda,2))))
#A box_cox transformation is more useful, we can see a new pattern of in 1970 Q1 there was a steep increase in gas production.
canadian_gas %>%
autoplot(Volume) +
labs(title = "Non-Transformed Gas Production")
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
canadian_gas %>%
autoplot(box_cox(Volume, lambda)) +
labs(y = "", title = latex2exp::TeX(paste0(
"Transformed gas production with $\\lambda$ = ",
round(lambda,2))))
# The Box-Cox transformation is not helpful here because the transformation did not make the seasonal variation uniform. According to the Box-Cox transformation section in the textbook: good value of λ is one which makes the size of the seasonal variation about the same across the whole series, as that makes the forecasting model simpler. The Guerrero method chose a lambda for us (of 0.58) but even with this, the variation isn't uniform.
set.seed(1234)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
autoplot(myseries, Turnover)+
labs(title = "Retail Turnover", y = "$AUD (in millions)")
lambda <- myseries %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
myseries %>%
autoplot(box_cox(Turnover, lambda)) +
labs(y = "", title = latex2exp::TeX(paste0(
"Transformed retail turnover with $\\lambda$ = ",
round(lambda,2))))
#For this specific seed, the Guerrero feature selected a lambda of 0.32 in order to make the variance stable.
autoplot(aus_production, Tobacco)+
labs(title = "Tobacco and Cigarette Production")
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
lambda <- aus_production %>%
features(Tobacco, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Tobacco, lambda)) +
labs(y = "", title = latex2exp::TeX(paste0("Transformed Tobacco Production with $\\lambda$ = ",
round(lambda,2))))
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
#Lambda here is almost 1, causing the data to transform by shifting down the y axis. The overall shape of the graph however, did not change. De to this, the Box-Cox transformation was not very effective in this instance.
mel_syd <- ansett %>%
filter(Class == "Economy",
Airports == "MEL-SYD")
autoplot(mel_syd, Passengers)+
labs(title = "Economy class Passengers Between Melbourne and Sydney")
lambda <- mel_syd %>%
features(Passengers, features = guerrero) %>%
pull(lambda_guerrero)
mel_syd %>%
autoplot(box_cox(Passengers, lambda)) +
labs(y = "", title = latex2exp::TeX(paste0("Transformed Number of Passengers with $\\lambda$ = ",
round(lambda,2))))
# Here the formula gave s a lambda of 2, and we are able to see the variation in more detail as essentially the trasformation was to the power of two (pssengers^2).
#Due to southern_cross being at an hourly level and the potential for their visuals to look cluttered, let's look at the data on weekly level as we'll be able to see a better overall picture.
data("pedestrian")
southern_cross <- pedestrian %>%
filter(Sensor == "Southern Cross Station")
southern_cross <- southern_cross %>%
mutate(Week = yearweek(Date)) %>%
index_by(Week) %>%
summarise(Count = sum(Count))
autoplot(southern_cross, Count)+
labs(title = "Weekly Pedestrian Counts at Southern Cross Station")
lambda <- southern_cross %>%
features(Count, features = guerrero) %>%
pull(lambda_guerrero)
southern_cross %>%
autoplot(box_cox(Count, lambda)) +
labs(y = "", title = latex2exp::TeX(paste0("Transformed Weekly Pedestrian Counts with $\\lambda$ = ",
round(lambda,2))))
#In this instance, -0.11 was chosen as our lambda. The Box-Cox transformation doesn't seem to make the data more uniform, so I don't believe it was effective here.
gas <- tail(aus_production, 5*4) %>% select(Gas)
autoplot(gas, Gas)
# There is a positive trend here, and the cycle is every 1 year and the cycle is that there's an increse after Q1 and peaks right before Q3, then decreases all the way back to Q1.
gas_decomp <- gas %>%
model(classical_decomposition(Gas, type = "multiplicative"))
components(gas_decomp) %>%
autoplot() +
labs(title = "Classical Multiplicative Decomposition of Gas Production")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
components(gas_decomp) %>%
as_tsibble() %>%
autoplot(Gas) +
geom_line(aes(y=season_adjust), linetype = "dashed") +
labs(title = "Gas Production (Seasonally Adjusted)")
gas %>%
mutate(Gas = ifelse(Gas == 249, Gas + 300, Gas)) %>%
model(classical_decomposition(Gas, type = "multiplicative")) %>%
components() %>%
as_tsibble() %>%
autoplot(Gas) +
geom_line(aes(y=season_adjust), linetype = "dashed") +
labs(title = "Gas Production (Seasonally Adjusted + outlier)")
#With the outlier, Q3 2008 had 400 added to it. This caused a big increase in both the seasonally adjusted and normal data, however the seasonally adjusted data was slightly less affected.
###Does it make any difference if the outlier is near the end rather than in the middle of the time series?
gas %>%
mutate(Gas = ifelse(Gas == 236, Gas + 300, Gas)) %>%
model(classical_decomposition(Gas, type = "multiplicative")) %>%
components() %>%
as_tsibble() %>%
autoplot(Gas) +
geom_line(aes(y=season_adjust), linetype = "dashed") +
labs(title = "Gas Production (Seasonally Adjusted + outlier)")
#The placement of the outlier still caused the data to have a spike, one thing to note is that the seasonally adjusted data's spike has increased compared to when the outlier was in the middle.
x11_decomp <- myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components()
autoplot(x11_decomp) +
labs(title = "Decomposition of Retail Turnover (X-11)")
#Using the X-11 decomposition method, we see the line is less smooth and is spikier. The method captured seasonality better and is displaying more irregularities that were not seen in the initial curve.