library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.3.0     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.2
## ✔ lubridate   1.9.4     ✔ fable       0.5.0
## ✔ ggplot2     4.0.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(scales)
library(ggrepel)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(patchwork)
library(slider)
  1. Consider the GDP information in global_economy. Plot the GDP per capita for each country over time. Which country has the highest GDP per capita? How has this changed over time?

In this code cell I calculate GDP per capita for each country and apply separate Box-Cox and log transformations to the GDP per capita values.
I also filter the dataset to country-level series, with the aim of excluding aggregated series for groups of countries (e.g. developed and developing economies).

global_economy
gdp_per_capita <- global_economy |> 
  mutate(Code = as.character(Code)) |> 
  filter(nchar(Code) == 3) |> 
  filter(!is.na(GDP), !is.na(Population)) |> 
  mutate(GDP_per_capita = GDP / Population) |> 
  as_tsibble(key = Country, index = Year)
gdp_per_capita
lambda <- BoxCox.lambda(gdp_per_capita$GDP_per_capita, method = 'guerrero')

gdp_per_capita <- gdp_per_capita |> 
  mutate(GDP_boxcox = BoxCox(GDP_per_capita, lambda)) |> 
  mutate(GDP_pc_log = log(GDP_per_capita))
gdp_per_capita

Let’s plot GDP per capita for all countries.

gdp_per_capita |> 
  ggplot(aes(x= Year, y = GDP_per_capita, group = Country))+
  geom_line(alpha = 0.15)+
  labs(
    y= 'GDP per capita',
    title = 'Plot of GDP/capita for each country'
  )

The plot above does not give a clear picture of the GDP per capita series: most countries are compressed at the bottom of the plot.

A better approach is to plot log-transformed values.

gdp_per_capita |> ggplot(aes(x= Year,
                         y = GDP_per_capita,
                         group = Country))+
  geom_line(alpha = 0.15) +
  scale_y_log10(labels = label_number())+
  labs(
    y = 'Log transformed GDP/capita',
    title = "Log transformed GDP/capita time series for each country"
  )

The log transformed plot gives a cleaner view by compressing extreme values and allowing growth patterns to be compared more effectively.

highest_gdp <- gdp_per_capita |> 
  slice_max(GDP_per_capita, n = 1)
  
print(highest_gdp[c('Country', "Year", 'GDP_per_capita')])
## # A tsibble: 1 x 3 [1Y]
## # Key:       Country [1]
##   Country  Year GDP_per_capita
##   <fct>   <dbl>          <dbl>
## 1 Monaco   2014        185153.

Monaco had the highest GDP per capita in the dataset: about $185,153 in 2014.

top_gdp_by_years <- gdp_per_capita |> 
  as_tibble() |> 
  group_by(Year) |> 
  slice_max(GDP_per_capita, n = 1, with_ties = FALSE) |> 
  ungroup() |> 
  as_tsibble(index = Year, key = Country)
top_gdp_by_years
top_gdp_by_years |> 
  ggplot(aes(x = Year,
             y = GDP_per_capita))+
  geom_line()+
  geom_point(aes(colour = Country), size = 1.9, alpha = 0.8)+
  scale_y_log10(labels = label_number())

There is a rising trend in this series, with a suggestion of a cyclic pattern after the 1980s.

In this plot we see that the United States had the highest GDP per capita in the world during the 1960s. However, Monaco took over around 1970 and dominated for most of the remaining years of the series. Liechtenstein and Luxembourg also recorded the highest GDP per capita in a few recent years.

  2. For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect: United States GDP from global_economy; slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock; Victorian Electricity Demand from vic_elec; Gas production from aus_production.

In general it is better to adjust economic data such as GDP to a per-capita basis.

us_gdp <- global_economy |> 
  filter(Code == "USA") |> 
  mutate(GDP_per_capita = GDP / Population)

p1 <- us_gdp |> 
  ggplot(aes(x = Year, y = GDP / 1000000)) +
  geom_line(color = "blue") +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(title = "United States GDP",
       y = "GDP (millions)")

p2 <- us_gdp |> 
  ggplot(aes(x = Year, y = GDP_per_capita)) +
  geom_line(color = "blue") +
  labs(title = "United States GDP per capita",
       y = "GDP per capita")

p1 / p2

The per-capita adjustment did not change the shape of the series much, but for further analysis I would still use the per-capita data.

victoria_livestock <- aus_livestock |> 
  filter(Animal == 'Bulls, bullocks and steers') |> 
  filter(State == "Victoria")
  
victoria_livestock |>  autoplot(Count)

The plot reveals an overall slightly decreasing trend, with suggestions of a cyclic pattern.

In general, mathematical transformations such as Box-Cox or a simple log transformation are useful when the variation in the series increases or decreases with the level of the series. A rolling mean versus rolling standard deviation plot helps to check whether that is the case here:

victoria_livestock_rm_rsd <- victoria_livestock |> 
  mutate(
    rolling_mean = slide_dbl(Count, mean, .before = 11, .complete = TRUE),
    rolling_sd = slide_dbl(Count, sd, .before = 11, .complete = TRUE)
  ) |> 
  filter(!is.na(rolling_mean), !is.na(rolling_sd))

ggplot(victoria_livestock_rm_rsd, 
       aes(x = rolling_mean, y = rolling_sd))+
  geom_point()

The rolling mean versus rolling standard deviation plot does not show a strong upward linear relationship: the points are scattered and do not form a clear increasing pattern.

From both plots I conclude that a mathematical transformation will probably not be helpful for this time series.

Let’s apply a Box-Cox transformation and see whether this conclusion holds.

lambda_meat <- BoxCox.lambda(victoria_livestock$Count, method = 'guerrero')
lambda_meat
## [1] 0.1615099

The lambda value is 0.16, which is close to zero, so the Box-Cox transformation will be similar to a log transformation.
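As a small side check (not part of the exercise), here is a base-R sketch showing that the Box-Cox transform (y^lambda − 1)/lambda approaches log(y) as lambda goes to zero, which is why a small lambda behaves much like a log transformation. The value y and the lambdas below are purely illustrative.

# Box-Cox transform of y = 100 for progressively smaller lambdas
y <- 100
sapply(c(0.5, 0.16, 0.05, 0.01, 0.001), \(lambda) (y^lambda - 1) / lambda)
# compare with the limiting value
log(y)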

Let’s take a look at a plot of the Box-Cox transformed time series.

victoria_livestock <-  victoria_livestock |> 
  mutate(Count_bxcx = BoxCox(Count, lambda_meat))
victoria_livestock |> autoplot(Count_bxcx)+
  labs(title = 'Box-Cox transformed time series. Lambda = 0.16',
       y = 'Count (Box-Cox transformed)')

As we can see, there is not much difference between the plots of the original and the transformed data.

vic_elec
vic_elec |>
  autoplot(Demand) +
  labs(title = 'Victorian Electricity Demand. Time period - 30 minutes')

From this plot I can see that the half-hourly demand series is very dense, with clear daily, weekly and yearly seasonal patterns and occasional extreme demand spikes.

Let’s apply a Box-Cox transformation.

lambda_ve <-  vic_elec |> 
  features(Demand, features = guerrero) |> 
  pull(lambda_guerrero)
lambda_ve
## [1] 0.09993089

Lambda = 0.1, which is almost zero, so it is appropriate simply to apply a log transformation.

vic_elec |> 
  mutate(demand_bxcx = box_cox(Demand, 0)) |> 
  autoplot(demand_bxcx)+
  scale_y_continuous(labels = label_number())+
  labs(title = 'Log transformed time series',
       y = 'Demand (log transformed)')

The log transformation merely compressed the variation in the data and did not make the series noticeably more homogeneous. Therefore a transformation is not needed for this time series.

gas_production <- aus_production |> 
  select(c('Quarter', 'Gas'))
#gas_production
gas_production |> autoplot(Gas)+
  labs(title = "Gas production time series",
       y = 'Production Volumes')

The plot shows a strong upward trend, and the seasonal variation increases with the level of the series, suggesting a multiplicative pattern.

lambda_gas <- gas_production |> 
  features(Gas, features = guerrero) |> 
  pull(lambda_guerrero)

lambda_gas
## [1] 0.1095171

Lambda = 0.11, which is close to zero, so a log transformation is reasonable for this time series.

gas_production <- gas_production |> 
  mutate(Gas_bxcx = box_cox(Gas, lambda_gas)) |> 
  mutate(Gas_log = log(Gas))


gas_production |> autoplot(Gas_log)+
  labs(title = 'Log transformed gas production time series',
       y = 'Production volumes (log transformed)' )

We see that the log transformation helped to stabilize the seasonal variation, and the transformed series now has a roughly additive structure.

  3. Why is a Box-Cox transformation unhelpful for the canadian_gas data?
canadian_gas
lambda_canada_gas <- canadian_gas |> 
  features(Volume, features = guerrero) |> 
  pull(lambda_guerrero)
lambda_canada_gas
## [1] 0.5767648

canada_gas <- canadian_gas |> 
#  filter(year(Month) > 1990) |> 
#  mutate(Volume = Volume * 10000) |> 
  mutate(Volume_bxcx = box_cox(Volume, lambda_canada_gas)) |> 
  mutate(Volume_log = log(Volume)) |> 
  mutate(Volume_inverse = box_cox(Volume, -0.3))
canada_gas |> 
  autoplot(Volume)+
  labs(title = 'Canadian gas production time series')

The plot reveals that the seasonal variation is small in the early years, largest through the middle of the series (roughly the late 1970s to about 1990), and smaller again towards the end. In other words, the variation does not change monotonically with the level of the series.

canada_gas <- canada_gas |> 
  mutate(
    rolling_mean = slide_dbl(Volume, mean, .before = 11,.complete = TRUE),
    rolling_sd = slide_dbl(Volume, sd, .before = 11, .complete = TRUE)
  )

ggplot(canada_gas, aes(rolling_mean, rolling_sd))+
  geom_point()
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

The rolling mean versus rolling standard deviation plot does not show a linear, rising relationship between the rolling means and the rolling standard deviations.
This suggests that the variance is not simply proportional to the level of the series.
Box-Cox and log transformations may therefore not produce a stable seasonal variance.

canada_gas |> 
  autoplot(Volume_bxcx)+
  labs(title = "Box-cox transformed time series")

The plot of the Box-Cox transformed series is almost identical to the plot of the original series, which confirms the findings above.

canada_gas |> 
  autoplot(Volume_log)+
  labs(title = 'Log transformed time series')

It appears that the log transformation helped to stabilize the seasonal variation up to about 1990, but not across the entire series. This is why a Box-Cox transformation is unhelpful here: no single value of lambda can stabilize a seasonal variation that first grows and then shrinks.

  4. What Box-Cox transformation would you select for your retail data (from Exercise 7 in Section 2.10)?
aus_retail |> 
  distinct(State)
aus_retail |> 
  distinct(Industry)

For this exercise I will use the “Electrical and electronic goods retailing” time series.

ind_name <- 'Electrical and electronic goods retailing'

ind_states <- aus_retail |> 
  filter(Industry == ind_name)

industry_aus <- ind_states |> 
  as_tibble() |> 
  group_by(Month) |> 
  summarise(Turnover = sum(Turnover, na.rm = TRUE),
            State = 'Australia',
            Industry = ind_name,
            .groups = 'drop'
            ) |> 
  as_tsibble(key = c(State, Industry), index = Month)
industry_all <- bind_rows(ind_states, industry_aus) |> 
  filter(State == 'Australia') |> 
  arrange(Month)

industry_all |> 
  autoplot(Turnover)+
  labs(title = "Electrical and electronic goods retailing time series")

There is strong evidence of a multiplicative pattern in this time series.
A Box-Cox transformation should help to stabilize the seasonal variance.

lambda_retail <- industry_all |> 
  features(Turnover, features = guerrero) |> 
  pull(lambda_guerrero)

lambda_retail
## [1] -0.02386302

Lambda = -0.02, so the transformation will be very similar to a log transformation.

industry_all <- industry_all |> 
  mutate(Turnover_bxcx = box_cox(Turnover, lambda_retail))

industry_all |> 
  autoplot(Turnover_bxcx)

As we can see, the Box-Cox transformation helped to stabilize the seasonal variance and made the series close to homogeneous.

  5. For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance: Tobacco from aus_production, Economy class passengers between Melbourne and Sydney from ansett, and Pedestrian counts at Southern Cross Station from pedestrian.

Tobacco production

aus_production |> 
  autoplot(Tobacco)
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

From this plot I would say that a mathematical transformation may not stabilize the variance: the data do not appear to meet the requirement discussed earlier, namely that the variation should increase with the level of the series.

lambda_tobacco <- aus_production |> 
  features(Tobacco, features = guerrero) |> 
  pull(lambda_guerrero)

lambda_tobacco
## [1] 0.9264636

Lambda = 0.93, which is very close to one, so applying this transformation is unlikely to change the seasonal variance.

aus_production |> 
  mutate(Tobacco_bxcx = box_cox(Tobacco, lambda_tobacco)) |> 
  autoplot(Tobacco_bxcx)+
  labs(title = 'Box-cox transformed time series. Lambda = 0.93')
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

As we can see, the Box-Cox transformation did not stabilize the variance.

Economy class passengers time series

ansett
melsyd_econ_passengers <- ansett |> 
  filter(
    Airports == 'MEL-SYD',
    Class == 'Economy'
  ) 
melsyd_econ_passengers|> 
  autoplot(Passengers)+
  labs(title = "Economy class passengers time series")

any(melsyd_econ_passengers$Passengers <= 0)
## [1] TRUE
melsyd_econ_passengers |> 
  filter(Passengers == 0) |> 
  select(Week, Passengers)

The seasonal variance does not appear to change proportionally with the level of the series.
There are also several extreme outliers causing irregular fluctuations.

I do not think a mathematical transformation will help to stabilize the variance of this weekly series. A transformation might be more useful on data aggregated to monthly or quarterly periods, but since that is not required by the exercise I will not show those results here; I will experiment with it later on my own.

Also, there is a continuous period of zero values that occurred during the pilots’ strike. A log transformation is not applicable unless that period is removed; another option is to apply a log1p transformation.

Let’s apply a Box-Cox transformation.

lambda_melsyd <- melsyd_econ_passengers |> 
  features(Passengers, features = guerrero) |> 
  pull(lambda_guerrero)
lambda_melsyd
## [1] 1.999927

Lambda = 2, which will not help to stabilize the variance; instead it will increase the variance at higher levels of the series.

melsyd_econ_passengers <- melsyd_econ_passengers |> 
  mutate(Passengers_bxcx = box_cox(Passengers, lambda_melsyd)) |>
 # filter(Passengers>0) |> 
  mutate(Passengers_log1p = log1p(Passengers))

melsyd_econ_passengers |> autoplot(Passengers_bxcx)+
  labs(title = 'Box-cox transformed time series. Lambda = 2')

As we can see, Box-Cox with lambda = 2 did not produce a stable seasonal variance.

Pedestrian counts at Southern Cross Station

pedestrian_station <- pedestrian |> 
  filter(Sensor == 'Southern Cross Station')

pedestrian_station
pedestrian_station |> 
  autoplot(Count)+
  labs(title = "Pedestrian counts at Southern Cross Station time series")

This time series plot is dense, and it is difficult to judge from it whether a transformation would be helpful.
I’ll use a rolling mean versus rolling SD plot to see whether the variance increases with the level of the series.

pedestrian_station <-  pedestrian_station |> 
  fill_gaps() |> 
  mutate(
    rolling_mean = slide_dbl(Count, mean, .before = 24*7, na.rm = TRUE, .complete = TRUE),
    rolling_sd = slide_dbl(Count, sd, .before = 24*7, na.rm = TRUE, .complete = TRUE)
  )

ggplot(pedestrian_station,aes(rolling_mean, rolling_sd))+
  geom_point()+
  labs(title = "Rolling mean vs rolling SD's plot")
## Warning: Removed 168 rows containing missing values or values outside the scale range
## (`geom_point()`).

This plot shows that as the rolling mean increases, the rolling standard deviation also increases. This is strong evidence of heteroscedasticity and of a multiplicative structure.

This time series likely has multiple seasonalities: a daily pattern (morning and evening commuter peaks) and a weekly pattern (weekdays versus weekends). A seasonal plot makes the daily pattern easy to see (see the sketch below).
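Here is a minimal sketch of that check, using gg_season() from feasts (loaded with fpp3); the NA values introduced by fill_gaps() may trigger a removed-rows warning.

# Overlay each day as a separate line to show the daily commuter pattern
pedestrian_station |> 
  gg_season(Count, period = "day") +
  labs(title = "Daily pattern of pedestrian counts at Southern Cross Station",
       y = "Count")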

lambda_pedestrian <- pedestrian_station |> 
  features(Count, features = guerrero) |> 
  pull(lambda_guerrero)

lambda_pedestrian
## [1] -0.2413498

Lambda = -0.24, which is close to zero, so a log transformation is a reasonable choice.

Let’s apply a log transformation.

pedestrian_station <- pedestrian_station |> 
  mutate(Count_bxcx = box_cox(Count, 0))

pedestrian_station |> 
  autoplot(Count_bxcx) +
  labs(title = 'Log transformed time series',
       y = 'Pedestrians count (log transformed)',
       x = "Time period 1 hour")

After the transformation the seasonal variation looks more stable across the series, although the hourly plot remains very dense.

  6. Show that a 3×5 MA is equivalent to a 7-term weighted moving average with weights of 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067.

A 3×5 MA first takes 5-term moving averages and then takes a 3-term moving average of those results. To derive the weights we only need the 7 positions involved (x1..x7); no actual data values are required.

We’ll compute how much total weight each position receives from the three 5-term windows.

w <- rep(0, 7)  # weights for x1..x7

Each 5-term window contributes a weight of 1/5 to each of the positions it covers.

w[1:5] <- w[1:5] + 1/5   # window 1: x1..x5
w[2:6] <- w[2:6] + 1/5   # window 2: x2..x6
w[3:7] <- w[3:7] + 1/5   # window 3: x3..x7

Now we average the three windows, i.e. divide the accumulated weights by 3.

w <- w / 3

w
## [1] 0.06666667 0.13333333 0.20000000 0.20000000 0.20000000 0.13333333 0.06666667
round(w, 3)
## [1] 0.067 0.133 0.200 0.200 0.200 0.133 0.067

This gives exactly the weights of the 7-term weighted moving average stated in the exercise: 0.067, 0.133, 0.200, 0.200, 0.200, 0.133 and 0.067.

Let’s label the weights by position and check that they sum to one.

names(w) <- paste0("x", 1:7)
w
##         x1         x2         x3         x4         x5         x6         x7 
## 0.06666667 0.13333333 0.20000000 0.20000000 0.20000000 0.13333333 0.06666667
sum(w)
## [1] 1
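As an optional numeric check (my own sketch, not required by the exercise), applying a centered 5-MA and then a centered 3-MA to an arbitrary series should give the same result as a single 7-term weighted MA using the weights w derived above. The call below uses slide_dbl() from the slider package loaded at the top, and the comparison should return TRUE.

set.seed(1)
y <- rnorm(20)                                                            # arbitrary series
ma5   <- slide_dbl(y,   mean, .before = 2, .after = 2, .complete = TRUE)  # centered 5-MA
ma3x5 <- slide_dbl(ma5, mean, .before = 1, .after = 1, .complete = TRUE)  # 3-MA of the 5-MA
wma7  <- slide_dbl(y, \(x) sum(w * x), .before = 3, .after = 3, .complete = TRUE)
all.equal(ma3x5, wma7)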
  7. Consider the last five years of the Gas data from aus_production.
gas <- tail(aus_production, 5*4) |> 
  select(Gas)
gas

7a. Plot the time series. Can you identify seasonal fluctuations and/or a trend-cycle?

gas |> 
  autoplot(Gas)

The time series plot reveals strong quarterly seasonal fluctuations, with gas production peaking in the winter quarters, together with a rising trend-cycle.

7b. Use classical_decomposition with type=multiplicative to calculate the trend-cycle and seasonal indices.

gas_indices <- gas |> 
  model(
    classical_decomposition(Gas, type = 'multiplicative')
  ) |> 
  components()
gas_indices |> 
  select(Quarter, trend, seasonal)

7c. Do the results support the graphical interpretation from part a?

We can see that the trend-cycle estimates rise from year to year, which confirms the graphical interpretation in part a. There is no evidence of longer-term cycles, although we would need more data to rule them out.
The seasonal indices confirm the quarterly seasonality, with production above average in the winter quarters (Q2 and Q3) and below average in Q1 and Q4.

7d. Compute and plot the seasonally adjusted data.

gas_season_adj <- gas_indices |> 
  mutate(seasonal_adj = Gas/seasonal)

gas_season_adj |> 
  select(Quarter, trend, seasonal, seasonal_adj)
gas_season_adj |> 
  autoplot(seasonal_adj)

We see that the quarterly fluctuations are removed in the seasonally adjusted series. It now consists of the trend-cycle plus a small remainder component, which confirms the rising trend in the series.

7e. Change one observation to be an outlier (e.g., add 300 to one observation), and recompute the seasonally adjusted data. What is the effect of the outlier?

gas_with_outlier <- gas
gas_with_outlier$Gas[6] <- gas_with_outlier$Gas[6] + 300
gas_with_outlier
gas_with_outlier_components <- gas_with_outlier |> 
  model(classical_decomposition(Gas, type = 'multiplicative')
        ) |> 
  components() |> 
  mutate(seasonal_adjust = Gas / seasonal) 
gas_with_outlier_components |> 
  select(Quarter, trend, seasonal, seasonal_adjust)
gas_with_outlier_components |> 
  autoplot(seasonal_adjust)

Classical decomposition uses moving averages to estimate the trend-cycle and seasonal components; for a quarterly series it uses a 2×4 centered MA. Moving averages are sensitive to extreme outliers, so a single outlier in a quarterly series distorts the trend-cycle estimates for its own quarter plus the two quarters before and the two after it.
Therefore, both the trend-cycle and the seasonally adjusted series become locally biased around the outlier.
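To make the 2×4 centered MA concrete, here is a small sketch of my own (not part of the exercise) that recomputes the trend manually with weights 1/8, 1/4, 1/4, 1/4, 1/8 and compares it with the trend column returned by classical_decomposition(); if that function follows the textbook definition, the comparison should return TRUE.

# Manual 2x4 centered MA over the series containing the outlier
manual_trend <- slide_dbl(
  gas_with_outlier$Gas,
  \(x) sum(x * c(1, 2, 2, 2, 1) / 8),   # 2x4 centered MA weights
  .before = 2, .after = 2, .complete = TRUE
)
all.equal(manual_trend, gas_with_outlier_components$trend)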

7f. Does it make any difference if the outlier is near the end rather than in the middle of the time series?

Let’s set the last period of the series as an outlier.

gas_with_end_outlier <- gas
gas_with_end_outlier$Gas[20] <- gas_with_end_outlier$Gas[20] + 300
gas_with_end_outlier
gas_with_end_outlier_components <- gas_with_end_outlier |> 
  model(classical_decomposition(Gas, type = 'multiplicative')
        ) |> 
  components() |> 
  mutate(seasonal_adjust = Gas / seasonal) 
gas_with_end_outlier_components |> 
  select(Quarter, trend, seasonal, seasonal_adjust)
gas_with_end_outlier_components |> 
  autoplot(seasonal_adjust)

As mentioned earlier, classical decomposition uses moving averages, so an outlier affects the trend only locally. In this case, however, the outlier occurs at the end of the series, where the centered MA cannot be computed for the last two quarters, so fewer complete moving-average windows include the outlier.
Therefore the distortion is less symmetric and affects fewer trend estimates than an outlier in the middle of the series.
The same reasoning applies to an outlier in the first periods of the series.
However, even at the boundaries an outlier can still create the illusion of a sudden increase or decrease in the underlying trend, especially in the last few observed periods, which can lead to misleading interpretations about recent changes in the series.
As a result, extreme outliers should always be investigated before applying classical decomposition, because this method is not robust to outliers and may produce biased trend and seasonal estimates.
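As a quick check of this reasoning (my own sketch, not required by the exercise), we can count how many trend-cycle estimates differ from those of the clean series in each case. The middle outlier should change the trend at five quarters (t−2 to t+2), while the end outlier should change it at only one, because the other affected windows fall beyond the end of the series where the centered MA is undefined.

# Trend from the original (outlier-free) series
clean_trend <- gas |> 
  model(classical_decomposition(Gas, type = 'multiplicative')) |> 
  components() |> 
  pull(trend)

# Number of trend values changed by each outlier
sum(gas_with_outlier_components$trend     != clean_trend, na.rm = TRUE)
sum(gas_with_end_outlier_components$trend != clean_trend, na.rm = TRUE)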

  8. Recall your retail time series data (from Exercise 7 in Section 2.10). Decompose the series using X-11. Does it reveal any outliers, or unusual features that you had not noticed previously?
set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries

Time series plot

myseries |> 
  autoplot(Turnover)

The plot shows an upward trend and strong yearly seasonality, with the seasonal variation increasing as the level of the series rises.

Classical decomposition

First, let’s apply classical decomposition so that we can compare it with the X-11 decomposition that follows.

myseries |> 
  model(
    classical_decomposition(Turnover, type = 'multiplicative')
  ) |> 
  components() |> 
  autoplot() +
  labs(title = 'Classical decomposition of liquor sales in Australia')
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

X11 decomposition

myseries_x11_dcmp <- myseries |> 
  model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) |> 
  components()

myseries_x11_dcmp |> 
  autoplot()+
  labs(title = "X11 decomposition of liquor sales in Australia")

The classical and X-11 decomposition plots show broadly the same trend and seasonal patterns. However, the X-11 decomposition estimates the trend-cycle right up to the ends of the series (classical decomposition leaves missing values there), allows the seasonal component to change gradually over time, and its irregular component makes outliers and unusual observations easier to spot than in the original turnover plot.

  9. Figures below show the result of decomposing the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.

Figure 1: Decomposition of the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.
Figure 2: Seasonal component from the decomposition shown in the previous figure.

a. Write about 3–5 sentences describing the results of the decomposition. Pay particular attention to the scales of the graphs in making your interpretation.

Also, the seasonal sub-series in Figure 2 reveal that the seasonal effects are not constant over the years: the size of the effect for several months drifts noticeably over the sample period.

Therefore, we cannot assume constant seasonal indices over the years for this series.