library(fpp3)
library(knitr)
library(seasonal)
library(cowplot)
library(pracma)
library(geoR)
Consider the GDP information in global_economy. Plot the GDP per capita for each country over time. Which country has the highest GDP per capita? How has this changed over time?
Since there are so many countries (and non-country aggregates) in the global_economy data set, a single plot showing every country's GDP per capita over time would be unreadable. So we limit our analysis to the 20 countries with the highest GDP per capita in 2017.
data(global_economy)
global_economy <- global_economy |>
mutate(GDP_PER_CAPITA = GDP/Population)
top_20_GDP_per_capita_2017 <- global_economy |>
filter(Year == 2017) |>
arrange(desc(GDP_PER_CAPITA)) |>
top_n(20, GDP_PER_CAPITA)
global_economy |>
filter(Country %in% top_20_GDP_per_capita_2017$Country) |>
autoplot(GDP_PER_CAPITA, show.legend = FALSE) +
facet_wrap(Country ~ ., ncol = 5) +
labs(title = "Per Capita GDP", y = "$US")
Luxembourg had the highest GDP per capita in 2017 at $104,103.04. The trend is generally upward for all of these countries, with San Marino as the notable exception. Many of them, including Luxembourg, the United States, Qatar, Norway, Ireland, and Iceland, also show a noticeable dip around 2009, consistent with the global financial crisis.
For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect.
p1 <- global_economy |>
filter(Country == "United States") |>
autoplot(GDP) +
labs(title = "GDP", y = "$US")
p2 <- global_economy |>
filter(Country == "United States") |>
autoplot(GDP_PER_CAPITA) +
labs(title = "Per Capita GDP", y = "$US")
plot_grid(p1, p2)
The same upward trend is visible for the United States whether you look at total GDP or per capita GDP. A per capita adjustment is generally appropriate for GDP data: after dividing by population, the remaining upward trend tells us that GDP growth is not simply due to population growth.
data(aus_livestock)
p3 <- aus_livestock |>
filter(Animal == "Bulls, bullocks and steers" & State == "Victoria") |>
autoplot(Count)
p3
This livestock series doesn't need a mathematical transformation or a calendar/population adjustment. Trying log and Box-Cox transformations did not make the variation noticeably more uniform, as the sketch below illustrates.
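As a quick check of that claim, here is a sketch comparing the original counts with a log transformation and with a Box-Cox transformation using the guerrero-selected lambda. The object names vic_bulls and lambda_bulls are illustrative, not part of the original analysis.
# Sketch: does log or a guerrero-selected Box-Cox transformation make the
# variation in the Victorian slaughter counts any more uniform?
vic_bulls <- aus_livestock |>
  filter(Animal == "Bulls, bullocks and steers", State == "Victoria")
lambda_bulls <- vic_bulls |>
  features(Count, features = guerrero) |>
  pull(lambda_guerrero)
plot_grid(
  autoplot(vic_bulls, Count),
  autoplot(vic_bulls, log(Count)),
  autoplot(vic_bulls, box_cox(Count, lambda_bulls)),
  ncol = 1
)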
data(vic_elec)
p4 <- vic_elec |>
autoplot(Demand)
p4
We don't need to transform this electricity demand data mathematically or make a calendar/population adjustment either; the same check (log and a guerrero-selected Box-Cox transformation) did not make the variation noticeably more uniform.
data(aus_production)
p5 <- aus_production |>
autoplot(Gas)
lambda <- aus_production |>
features(Gas, features = guerrero) |>
pull(lambda_guerrero)
p6 <- aus_production |>
autoplot(box_cox(Gas, lambda))
p5
p6
Performing a Box-Cox transformation on this Australian gas production data makes the size of the seasonal variation roughly constant across the series.
Why is a Box-Cox transformation unhelpful for the canadian_gas data?
data(canadian_gas)
p7 <- canadian_gas |>
autoplot(Volume)
p8 <- canadian_gas |>
ggplot(aes(x = Volume)) +
geom_histogram(binwidth = 5)
lambda <- canadian_gas |>
features(Volume, features = guerrero) |>
pull(lambda_guerrero)
p9 <- canadian_gas |>
autoplot(box_cox(Volume, lambda))
p7
p9
A Box-Cox transformation of the Canadian gas data is unhelpful because the size of the seasonal variation does not increase steadily with the level of the series: it is small in the early years, largest in the middle of the series, and smaller again towards the end, so no single lambda can make it uniform. The original data are also already fairly symmetrically distributed, which becomes clear when we bin the data and plot a histogram.
p8
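The changing size of the seasonal swings can be seen directly by decomposing the series. This is a sketch (not part of the original answer) using an STL decomposition, whose seasonal component grows and then shrinks over the sample:
# Sketch: the seasonal component of canadian_gas grows in amplitude and then
# shrinks again, which no single Box-Cox lambda can stabilize.
canadian_gas |>
  model(STL(Volume)) |>
  components() |>
  autoplot()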
What Box-Cox transformation would you select for your retail data (from Exercise 8 in Section 2.10)?
data(aus_retail)
set.seed(1221)
my_series <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
p10 <- my_series |>
autoplot(Turnover)
p11 <- my_series |>
autoplot(log(Turnover))
p12 <- my_series |>
autoplot(nthroot(Turnover, n = 4))
p13 <- my_series |>
autoplot(nthroot(Turnover, n = 3))
p14 <- my_series |>
autoplot(sqrt(Turnover))
p15 <- my_series |>
ggplot(aes(x = Turnover)) +
geom_histogram(binwidth = 2.5)
p16 <- my_series |>
ggplot(aes(x = log(Turnover))) +
geom_histogram(binwidth = 0.1)
p17 <- my_series |>
ggplot(aes(x = nthroot(Turnover, n = 4))) +
geom_histogram(binwidth = 0.1)
p18 <- my_series |>
ggplot(aes(x = nthroot(Turnover, n = 3))) +
geom_histogram(binwidth = 0.1)
p19 <- my_series |>
ggplot(aes(x = sqrt(Turnover))) +
geom_histogram(binwidth = 0.1)
plot_grid(p10, p15)
plot_grid(p11, p16)
plot_grid(p12, p17)
plot_grid(p13, p18)
plot_grid(p14, p19)
A fourth-root transformation makes the distribution of my retail data more normal than a log, square-root, or cube-root transformation does. Let's check whether the guerrero feature suggests a different lambda, though.
lambda <- my_series |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
p20 <- my_series |>
autoplot(box_cox(Turnover, lambda))
lambda
## [1] 0.2583737
p20
The lambda returned by the guerrero feature is 0.258, which is very close to a fourth-root transformation, so we can stick with the easier-to-interpret lambda of 0.25.
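The reason lambda = 0.25 and the fourth root are interchangeable is that, for positive data, the Box-Cox transformation with lambda = 0.25 is just a linear rescaling of the fourth root: box_cox(y, 0.25) = (y^0.25 - 1)/0.25 = 4*y^0.25 - 4. A minimal sketch (reusing my_series from above) plots the two side by side:
# Sketch: box_cox(Turnover, 0.25) and the fourth root differ only by a linear
# rescaling, so both stabilize the variance in the same way.
plot_grid(
  autoplot(my_series, box_cox(Turnover, 0.25)),
  autoplot(my_series, nthroot(Turnover, n = 4)),
  ncol = 1
)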
For the following series, find an appropriate Box-Cox transformation in order to stabilize the variance. Tobacco from aus_production, Economy class passengers between Melbourne and Sydney from ansett, and Pedestrian counts at Southern Cross Station from pedestrian.
data(ansett)
data(pedestrian)
p21 <- aus_production |>
filter(!is.na(Tobacco)) |>
autoplot(Tobacco)
p22 <- aus_production |>
filter(!is.na(Tobacco)) |>
ggplot(aes(x = Tobacco)) +
geom_histogram(binwidth = 500)
lambda <- aus_production |>
filter(!is.na(Tobacco)) |>
features(Tobacco, features = guerrero) |>
pull(lambda_guerrero)
p23 <- aus_production |>
filter(!is.na(Tobacco)) |>
autoplot(box_cox(Tobacco, lambda))
p21
p22
lambda
## [1] 0.9264636
p23
For the Tobacco data from the aus_production dataset, a Box-Cox transformation is probably unnecessary. The size of the variation does not change much with the level of the series, which is why the lambda of 0.93 chosen by the guerrero feature is so close to 1; with lambda = 1, box_cox(y, 1) = y - 1, so the shape of the data is left unchanged.
ansett_filtered <- ansett |>
filter(Class == "Economy" & Airports == "MEL-SYD")
p24 <- ansett_filtered |>
autoplot(Passengers)
p25 <- ansett_filtered |>
ggplot(aes(x = Passengers)) +
geom_histogram(binwidth = 1000)
lambdas <- boxcoxfit(ansett_filtered$Passengers, lambda2 = TRUE)
lambdas <- lambdas[[1]]
p26 <- ansett_filtered |>
autoplot(((Passengers + lambdas[2])^lambdas[1] - 1)/lambdas[1])
p27 <- ansett_filtered |>
ggplot(aes(x = ((Passengers + lambdas[2])^lambdas[1] - 1)/lambdas[1])) +
geom_histogram()
p24
p25
lambdas
## lambda lambda2
## 2.997763 16948.778522
p26
p27
For the Economy class passengers between Melbourne and Sydney from the ansett dataset, we need two lambda values because the series contains observations of 0 (weeks with zero passengers); the second parameter (lambda2) shifts the data to be strictly positive before the usual transformation is applied. Here boxcoxfit chooses a shift of about 16,949 and a lambda of about 3. Even after the transformation, the data do not look much more normally distributed.
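For readability, the shifted transformation used in p26 and p27 can be wrapped in a small helper function. This is an illustrative sketch only; box_cox2 is not a fabletools function:
# Two-parameter Box-Cox: shift by lambda2, then apply the usual transformation.
# Assumes lambda1 != 0; with lambda1 == 0 you would use log(y + lambda2) instead.
box_cox2 <- function(y, lambda1, lambda2) {
  ((y + lambda2)^lambda1 - 1) / lambda1
}
# Equivalent to the inline expression above:
ansett_filtered |>
  autoplot(box_cox2(Passengers, lambdas[1], lambdas[2]))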
pedestrian_filtered <- pedestrian |>
filter(Sensor == "Southern Cross Station")
p28 <- pedestrian_filtered |>
autoplot(Count)
p29 <- pedestrian_filtered |>
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 250)
lambdas <- boxcoxfit(pedestrian_filtered$Count, lambda2 = TRUE)
lambdas <- lambdas[[1]]
p30 <- pedestrian_filtered |>
autoplot(((Count + lambdas[2])^lambdas[1] - 1)/lambdas[1])
p31 <- pedestrian_filtered |>
ggplot(aes(x = ((Count + lambdas[2])^lambdas[1] - 1)/lambdas[1])) +
geom_histogram(binwidth = 2.5)
p28
p29
lambdas
## lambda lambda2
## 0.1312404 0.0374300
p30
p31
For the Pedestrian counts at Southern Cross Station from the pedestrian dataset, we again need two lambda values since the series contains observations of 0 (hours with no pedestrians), so the data are shifted before the transformation. Here the fitted shift (lambda2) is only about 0.04 and the lambda is about 0.13. Even after the transformation, the data do not look much more normally distributed.
Consider the last five years of the Gas data from aus_production.
gas <- tail(aus_production, 5*4) |>
select(Gas)
p32 <- gas |>
autoplot(Gas)
p33 <- gas |>
gg_lag(Gas, geom = "point", lags = 1:4)
p34 <- gas |>
gg_season(Gas)
p32
p33
p34
Yes. There is a clear seasonal pattern: gas production peaks in Q3 and reaches its lowest point in Q1 each year. There is also an upward trend: gas production is increasing over time.
p35 <- gas |>
model(classical_decomposition(Gas, type="multiplicative")) |>
components() |>
autoplot()
p35
## Warning: Removed 2 rows containing missing values (`geom_line()`).
Yes, the results from the decomposition support the graphical interpretation from the first part. The general upward trend and the Q3 peak/Q1 valley seasonal pattern are both even easier to see from the decomposition.
x11_dcmp <- gas |>
model(x11 = X_13ARIMA_SEATS(Gas ~ x11())) |>
components()
p36 <- x11_dcmp |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = season_adjust))
p36
gas_outlier <- gas
gas_outlier$Gas[11] <- gas_outlier$Gas[11] + 300
x11_dcmp <- gas_outlier |>
model(x11 = X_13ARIMA_SEATS(Gas ~ x11())) |>
components()
p37 <- x11_dcmp |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = season_adjust))
p37
The outlier passes straight through into the seasonally adjusted series as a large spike. That spike stretches the y-axis scale, which makes the fluctuations in the surrounding observations look flatter than they really are. Apart from the spike itself, though, the seasonally adjusted line is largely robust to the outlier: the values before and after it change very little. If the decomposition were not robust, the outlier would leak into the seasonal estimates and smooth out the neighbouring values as well.
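To see this directly, we can overlay the seasonally adjusted series computed with and without the mid-series outlier. This sketch reuses the gas and gas_outlier objects defined above; sa_original and sa_outlier are illustrative names:
# Sketch: overlay the two seasonally adjusted series to isolate the outlier's effect.
sa_original <- gas |>
  model(x11 = X_13ARIMA_SEATS(Gas ~ x11())) |>
  components()
sa_outlier <- gas_outlier |>
  model(x11 = X_13ARIMA_SEATS(Gas ~ x11())) |>
  components()
ggplot() +
  geom_line(data = sa_original,
            aes(x = Quarter, y = season_adjust, colour = "Original")) +
  geom_line(data = sa_outlier,
            aes(x = Quarter, y = season_adjust, colour = "With outlier")) +
  labs(y = "Seasonally adjusted Gas", colour = NULL)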
gas_outlier <- gas
gas_outlier$Gas[20] <- gas_outlier$Gas[20] + 300
x11_dcmp <- gas_outlier |>
model(x11 = X_13ARIMA_SEATS(Gas ~ x11())) |>
components()
p38 <- x11_dcmp |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = season_adjust))
p38
Placing the outlier near the end of the series rather than in the middle makes little visible difference: the outlier again shows up as a spike in the seasonally adjusted series while the surrounding values stay largely unchanged. The decomposition remains relatively robust to the outlier, with little smoothing of the neighbouring observations.
Recall your retail time series data (from Exercise 8 in Section 2.10). Decompose the series using X-11. Does it reveal any outliers, or unusual features that you had not noticed previously?
x11_dcmp <- my_series |>
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) |>
components()
p39 <- x11_dcmp |>
ggplot(aes(x = Month)) +
geom_line(aes(y = Turnover, color = "Data")) +
geom_line(aes(y = season_adjust, color = "Seasonally Adjusted")) +
geom_line(aes(y = trend, color = "Trend")) +
scale_color_brewer(palette = "YlGnBu")
p39
The X-11 decomposition reveals a number of months between 2000 and 2010 where even the seasonally adjusted series sits well above the trend, pointing to unusual observations that were not obvious in the original plot; the sketch below lists the largest of them.
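To identify those months, one can rank how far the seasonally adjusted series departs from the trend. This is a sketch reusing the x11_dcmp object above; departure is an illustrative column name:
# Sketch: the ten months where seasonally adjusted turnover departs most from
# the X-11 trend, i.e. the most likely outliers.
x11_dcmp |>
  as_tibble() |>
  mutate(departure = season_adjust - trend) |>
  arrange(desc(abs(departure))) |>
  select(Month, Turnover, trend, season_adjust, departure) |>
  head(10)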
Figures 3.19 and 3.20 show the result of decomposing the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.
The trend component is generally upward and, because the STL decomposition is robust to unusual observations, it is not distorted by the outliers discussed in the next part. The seasonal component changes over time: August's seasonal decrease has grown to be the largest decrease of any month, December's seasonal increase has grown to be the largest increase, and July is the only month whose seasonal effect has changed sign, moving from a mild decrease to a mild increase.
The recession of 1991/1992 is visible in the remainder component as a run of large negative values. It is not visible in the trend because the STL decomposition is robust to such unusual observations, so the downturn is captured in the remainder rather than the trend.