library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.2
## ✔ lubridate 1.9.4 ✔ fable 0.5.0
## ✔ ggplot2 4.0.0
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
In this report, I will be answering questions 1, 2, 3, 4, 5, and 8 from the 2.10 section from the Hyndman online Forecasting book
Explore the following four time series: Bricks from aus_production, Lynx from pelt, Close from gafa_stock, Demand from vic_elec.
# using ? to get information about the 4 series
?aus_production
?pelt
?gafa_stock
?vic_elec
We can see in the documentation that the time intervals for aus_production, pelt, gafa_stock, and vic_elec are quarterly, annual, irregular daily trading days,and half-hourly, respectively.
I also found that we can use function interval() to this, which I will do for Bricks, Lynx, Close, and Demand in the chunk below.
bricks <- aus_production |>
select(Bricks)
bricks
## # A tsibble: 218 x 2 [1Q]
## Bricks Quarter
## <dbl> <qtr>
## 1 189 1956 Q1
## 2 204 1956 Q2
## 3 208 1956 Q3
## 4 197 1956 Q4
## 5 187 1957 Q1
## 6 214 1957 Q2
## 7 227 1957 Q3
## 8 222 1957 Q4
## 9 199 1958 Q1
## 10 229 1958 Q2
## # ℹ 208 more rows
lynx <- pelt |>
select(Lynx)
close <- gafa_stock |>
select(Close)
demand <- vic_elec |>
select(Demand)
bricks |> interval()
## <interval[1]>
## [1] 1Q
lynx |> interval()
## <interval[1]>
## [1] 1Y
close |> interval()
## <interval[1]>
## [1] !
demand |> interval()
## <interval[1]>
## [1] 30m
We can confirm the time intervals here. It is important to note that “!” for close is because the interval is not regular, since trading days include business days only.
# plots
autoplot(bricks)
## Plot variable not specified, automatically selected `.vars = Bricks`
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
autoplot(lynx)
## Plot variable not specified, automatically selected `.vars = Lynx`
autoplot(close)
## Plot variable not specified, automatically selected `.vars = Close`
# plot with title and axis labels modified
autoplot(demand) +
labs(
title = "Victoria's Total Electricity Demand[MWh]",
x = "Time [30min]",
y = "Electricty Demand [MWh]"
)
## Plot variable not specified, automatically selected `.vars = Demand`
Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock.
gafa_stock |>
group_by(Symbol) |>
slice_max(Close == max(Close)) |>
select(Symbol, Date, Close)
## # A tsibble: 4 x 3 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Symbol Date Close
## <chr> <date> <dbl>
## 1 AAPL 2018-10-03 232.
## 2 AMZN 2018-09-04 2040.
## 3 FB 2018-07-25 218.
## 4 GOOG 2018-07-26 1268.
Download the file tute1.csv from the book website, open it in Excel (or some other spreadsheet application), and review its contents. You should find four columns of information. Columns B through D each contain a quarterly series, labelled Sales, AdBudget and GDP. Sales contains the quarterly sales for a small company over the period 1981-2005. AdBudget is the advertising budget and GDP is the gross domestic product. All series have been adjusted for inflation.
tute1 <- readr::read_csv("//Users//joaodeoliveira//Downloads//tute1.csv")
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# I commented it out because of a knitting error
#View(tute1)
mytimeseries <- tute1 |>
mutate(Quarter = yearquarter(Quarter)) |>
as_tsibble(index = Quarter)
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y")
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line()
When we don’t include facet_grid() all three plots are plotted together with the same y-axis scale, which greatly impacts our ability to properly understand the data. Without facet_grid(), the GDP plot looks pretty consistent and regular, however, when we look at the plot using facet_grid() we can see that is not the case. The same principle is also true for the AdBudget plot but not at the same level. So, using the proper scale is key to accurately interpret the data.
The USgas package contains data on the demand for natural gas in the US.
# install.packages("USgas")
library(USgas)
state_gas_total <- us_total |>
as_tsibble(index = year, key = state)
state_gas_total |>
filter(state %in% c(
"Maine", "Vermont", "New Hampshire", "Massachusetts", "Connecticut",
"Rhode Island"
)) |>
autoplot(y) +
labs(
title = "New England Natural Gas Consumption",
x = "Year",
y = "Gas consumption [Mft^3]"
)
As we can see in the plot, the series suggests growing regional reliance
on natural gas, mainly in Connecticut. Smaller states display more
variability, likely reflecting differences in population size,
industrial activity, and energy infrastructure.
# part a
tourism_excel <- readxl::read_excel("//Users//joaodeoliveira//Downloads//tourism.xlsx")
glimpse(tourism_excel)
## Rows: 24,320
## Columns: 5
## $ Quarter <chr> "1998-01-01", "1998-04-01", "1998-07-01", "1998-10-01", "1999-…
## $ Region <chr> "Adelaide", "Adelaide", "Adelaide", "Adelaide", "Adelaide", "A…
## $ State <chr> "South Australia", "South Australia", "South Australia", "Sout…
## $ Purpose <chr> "Business", "Business", "Business", "Business", "Business", "B…
## $ Trips <dbl> 135.0777, 109.9873, 166.0347, 127.1605, 137.4485, 199.9126, 16…
# checking tourism from fpp3
glimpse(tourism)
## Rows: 24,320
## Columns: 5
## Key: Region, State, Purpose [304]
## $ Quarter <qtr> 1998 Q1, 1998 Q2, 1998 Q3, 1998 Q4, 1999 Q1, 1999 Q2, 1999 Q3,…
## $ Region <chr> "Adelaide", "Adelaide", "Adelaide", "Adelaide", "Adelaide", "A…
## $ State <chr> "South Australia", "South Australia", "South Australia", "Sout…
## $ Purpose <chr> "Business", "Business", "Business", "Business", "Business", "B…
## $ Trips <dbl> 135.0777, 109.9873, 166.0347, 127.1605, 137.4485, 199.9126, 16…
# part b
tourism_tsibble <- tourism_excel |>
mutate(Quarter = yearquarter(Quarter)) |>
as_tsibble(index = Quarter, key = c("Region", "State", "Purpose"))
# part c
tourism_excel |>
group_by(Region, Purpose) |>
summarize(avg_trips = mean(Trips)) |>
arrange(desc(avg_trips))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 304 × 3
## # Groups: Region [76]
## Region Purpose avg_trips
## <chr> <chr> <dbl>
## 1 Sydney Visiting 747.
## 2 Melbourne Visiting 619.
## 3 Sydney Business 602.
## 4 North Coast NSW Holiday 588.
## 5 Sydney Holiday 550.
## 6 Gold Coast Holiday 528.
## 7 Melbourne Holiday 507.
## 8 South Coast Holiday 495.
## 9 Brisbane Visiting 493.
## 10 Melbourne Business 478.
## # ℹ 294 more rows
# part d
state_tsibble <- tourism_tsibble |>
index_by(Quarter) |>
group_by(State) |>
summarize(Trips = sum(Trips)) |>
as_tibble(index = Quarter, key = state)
state_tsibble
## # A tibble: 640 × 3
## State Quarter Trips
## <chr> <qtr> <dbl>
## 1 ACT 1998 Q1 551.
## 2 ACT 1998 Q2 416.
## 3 ACT 1998 Q3 436.
## 4 ACT 1998 Q4 450.
## 5 ACT 1999 Q1 379.
## 6 ACT 1999 Q2 558.
## 7 ACT 1999 Q3 449.
## 8 ACT 1999 Q4 595.
## 9 ACT 2000 Q1 600.
## 10 ACT 2000 Q2 557.
## # ℹ 630 more rows
Use the following graphics functions: autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() and explore features from the following time series: “Total Private” Employed from us_employment, Bricks from aus_production, Hare from pelt, “H02” Cost from PBS, and Barrels from us_gasoline.
# Total Private employed from us_employment
employment <- us_employment |>
filter(Title == "Total Private")
us_employment
## # A tsibble: 143,412 x 4 [1M]
## # Key: Series_ID [148]
## Month Series_ID Title Employed
## <mth> <chr> <chr> <dbl>
## 1 1939 Jan CEU0500000001 Total Private 25338
## 2 1939 Feb CEU0500000001 Total Private 25447
## 3 1939 Mar CEU0500000001 Total Private 25833
## 4 1939 Apr CEU0500000001 Total Private 25801
## 5 1939 May CEU0500000001 Total Private 26113
## 6 1939 Jun CEU0500000001 Total Private 26485
## 7 1939 Jul CEU0500000001 Total Private 26481
## 8 1939 Aug CEU0500000001 Total Private 26848
## 9 1939 Sep CEU0500000001 Total Private 27468
## 10 1939 Oct CEU0500000001 Total Private 27830
## # ℹ 143,402 more rows
employment |>
autoplot(Employed)
employment |>
gg_season(Employed)
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
employment |>
gg_subseries(Employed)
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
employment |>
gg_lag(Employed)
## Warning: `gg_lag()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_lag()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
employment |>
ACF(Employed)
## # A tsibble: 29 x 3 [1M]
## # Key: Series_ID [1]
## Series_ID lag acf
## <chr> <cf_lag> <dbl>
## 1 CEU0500000001 1M 0.997
## 2 CEU0500000001 2M 0.993
## 3 CEU0500000001 3M 0.990
## 4 CEU0500000001 4M 0.986
## 5 CEU0500000001 5M 0.983
## 6 CEU0500000001 6M 0.980
## 7 CEU0500000001 7M 0.977
## 8 CEU0500000001 8M 0.974
## 9 CEU0500000001 9M 0.971
## 10 CEU0500000001 10M 0.968
## # ℹ 19 more rows
The us_emplyment series has a strong upward trend (increasing employment along time) with some seasonality (lower levels towards the beginning and end of the year). We can see that in 2000s there was a decline, probably explained by the 2007/2008 financial crisis.
aus_production |>
autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
aus_production |>
gg_season(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
aus_production |>
gg_subseries(Bricks)
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).
aus_production |>
gg_lag(Bricks)
## Warning: Removed 20 rows containing missing values (gg_lag).
aus_production |>
ACF(Bricks)
## # A tsibble: 22 x 2 [1Q]
## lag acf
## <cf_lag> <dbl>
## 1 1Q 0.900
## 2 2Q 0.815
## 3 3Q 0.813
## 4 4Q 0.828
## 5 5Q 0.720
## 6 6Q 0.642
## 7 7Q 0.655
## 8 8Q 0.692
## 9 9Q 0.609
## 10 10Q 0.556
## # ℹ 12 more rows
The australian brick production time series shows a clear seasonality, with much more production in the last 3 quarters of the year when compared with the first quarter. We can also see sharp production declines (mid 1970s, early 1980s, mid 1990s, and early 2000s). It is very important to note that there is not an obvious long term trend, however, the production has been decreasing. The peak was in the 1980s, howver the last collected data is at the 1970s level, which means that the production was probably transferred overseas.
pelt |>
autoplot(Hare)
# pelt |>
# gg_season(Hare)
pelt |>
gg_subseries(Hare)
pelt |>
gg_lag(Hare)
pelt |>
ACF(Hare)
## # A tsibble: 19 x 2 [1Y]
## lag acf
## <cf_lag> <dbl>
## 1 1Y 0.658
## 2 2Y 0.214
## 3 3Y -0.155
## 4 4Y -0.401
## 5 5Y -0.493
## 6 6Y -0.401
## 7 7Y -0.168
## 8 8Y 0.113
## 9 9Y 0.307
## 10 10Y 0.340
## 11 11Y 0.296
## 12 12Y 0.206
## 13 13Y 0.0372
## 14 14Y -0.153
## 15 15Y -0.285
## 16 16Y -0.295
## 17 17Y -0.202
## 18 18Y -0.0676
## 19 19Y 0.0956
I couldn’t use the gg_season(), I assume because the data is annual. We can see that the number of snowshoe Hare pelts traded doesn’t have a trend, however, we can see that there is a cyclical behavior with maximums (max amount of snowshoe hare pelt traded) happen every 10 years (1862, 1874, 1883/4, etc).
water <- PBS |>
filter(ATC2 == "H02")
water |>
autoplot(Cost)
water |>
gg_season(Cost)
water |>
gg_subseries(Cost)
# I commented this one off because I keep get an error saying the there is more
# than one time series. I tried to use another filter but it still didn't work.
# so, I filtered using concession and type (besides the H20 filter)
water |>
filter(Concession == "Concessional", Type == "Co-payments") |>
gg_lag(Cost)
water |>
filter(Concession == "General", Type == "Co-payments") |>
gg_lag(Cost)
water |>
filter(Concession == "Concessional", Type == "Safety net") |>
gg_lag(Cost)
water |>
filter(Concession == "General", Type == "Safety net") |>
gg_lag(Cost)
water |>
ACF(Cost)
## # A tsibble: 92 x 6 [1M]
## # Key: Concession, Type, ATC1, ATC2 [4]
## Concession Type ATC1 ATC2 lag acf
## <chr> <chr> <chr> <chr> <cf_lag> <dbl>
## 1 Concessional Co-payments H H02 1M 0.834
## 2 Concessional Co-payments H H02 2M 0.679
## 3 Concessional Co-payments H H02 3M 0.514
## 4 Concessional Co-payments H H02 4M 0.352
## 5 Concessional Co-payments H H02 5M 0.264
## 6 Concessional Co-payments H H02 6M 0.219
## 7 Concessional Co-payments H H02 7M 0.253
## 8 Concessional Co-payments H H02 8M 0.337
## 9 Concessional Co-payments H H02 9M 0.464
## 10 Concessional Co-payments H H02 10M 0.574
## # ℹ 82 more rows
The monthly medicare Australia prescription data is organized by type of concession and type of payment. Looking at the combined data we can see a positive trend, suggesting a continuous increase in expenditure. There is a monthly seasonality. The gg-season plot is very helpful to understand that there is a different behavior along the year when comparing different types (co-payments vs safety net), with safety net cost being higher at the beggining and end of the year, on the contrary of the co-payments with higher cost mid-year (suggesting that after the end of the summer people probably reached the $290 threshold).
us_gasoline |>
autoplot(Barrels)
us_gasoline |>
gg_season(Barrels)
us_gasoline |>
gg_subseries(Barrels)
us_gasoline |>
gg_lag(Barrels)
us_gasoline |>
ACF(Barrels)
## # A tsibble: 31 x 2 [1W]
## lag acf
## <cf_lag> <dbl>
## 1 1W 0.893
## 2 2W 0.882
## 3 3W 0.873
## 4 4W 0.866
## 5 5W 0.847
## 6 6W 0.844
## 7 7W 0.832
## 8 8W 0.831
## 9 9W 0.822
## 10 10W 0.808
## # ℹ 21 more rows
US gasoline barrels has a strong seasonality with higher values mid-year (summer) when compared with the rest of the year. The production has been continuously increasing every year with some exceptions like between the 2007 financial crisis and 2014. The data distribution seems volatile, with a lot of variation.